Add Wikipedia example
defanator committed May 19, 2024
1 parent d5c86c9 commit b742ad5
Showing 7 changed files with 253 additions and 9 deletions.
15 changes: 12 additions & 3 deletions Makefile
@@ -32,7 +32,7 @@ SHOW_ENV_VARS = \
BUILD

help: ## Show help message (list targets)
@awk 'BEGIN {FS = ":.*##"; printf "\nTargets:\n"} /^[$$()% 0-9a-zA-Z_-]+:.*?##/ {printf " \033[36m%-10s\033[0m %s\n", $$1, $$2}' $(SELF)
@awk 'BEGIN {FS = ":.*##"; printf "\nTargets:\n"} /^[$$()% 0-9a-zA-Z_-]+:.*?##/ {printf " \033[36m%-13s\033[0m %s\n", $$1, $$2}' $(SELF)

show-var-%:
@{ \
@@ -59,9 +59,18 @@ lint: version ## Run linters
fmt: version ## Run formatters
tox run -e fmt

venv: version ## Create virtualenv
.venv:
tox devenv --list-dependencies .venv

venv: .venv ## Create virtualenv

venv-examples: venv ## Install extra modules for examples
$(TOPDIR)/.venv/bin/python3 -m pip install -r $(TOPDIR)/examples/requirements.txt

nltk_data: venv-examples ## Download nltk data required for examples
mkdir -p $(TOPDIR)/.venv/nltk_data
NLTK_DATA="$(TOPDIR)/.venv/nltk_data" $(TOPDIR)/.venv/bin/python3 -m textblob.download_corpora

clean: ## Clean up
find $(TOPDIR)/ -type f -name "*.pyc" -delete
find $(TOPDIR)/ -type f -name "*.pyo" -delete
@@ -73,4 +82,4 @@ clean: ## Clean up
rm -rf $(TOPDIR)/htmlcov-py*
rm -f $(TOPDIR)/VERSION

.PHONY: build test lint fmt venv clean
.PHONY: build test lint fmt clean
101 changes: 101 additions & 0 deletions examples/README.md
@@ -0,0 +1,101 @@
# Examples

This directory contains a couple of example programs that demonstrate usage of the wordsearch module (a minimal sketch of the board API they share follows this list):

1. [place_and_discover.py](place_and_discover.py) - a simple demo of placing a list of words onto a board and running discovery against each word.
2. [create_board_from_wikipedia.py](create_board_from_wikipedia.py) - an extended demo of creating a board from a random Wikipedia article.
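
The sketch below shows the `WordsearchBoard` calls both programs build on. The constructor arguments, `place_word()`, `mask_grid()`, and `print_board()` are taken from the example sources; the word list and board size are arbitrary, and the exact behavior of `mask_grid()` (filling the empty cells before printing) is inferred from the sample output further down:

```
from wordsearch import WordsearchBoard

# an arbitrary word list, just for illustration
words = ["PYTHON", "BOARD", "PUZZLE", "EXAMPLE"]

ws = WordsearchBoard(width=15, height=15)

# placing longer words first improves the odds that everything fits
for w in sorted(words, key=len, reverse=True):
    if not ws.place_word(w):
        raise SystemExit(f"failed to place word: {w}")

ws.mask_grid()    # hide the placed words among filler letters
ws.print_board()  # dump the resulting puzzle to stdout
```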

# Prerequisites

While the first program is quite simple and does not require any extra dependencies, the second one is a bit more involved.
It uses a few extra modules under the hood, in particular the following (a condensed sketch of how they fit together appears right after the list):

- [requests](https://requests.readthedocs.io/en/latest/) to fetch resources from Wikipedia,
- [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) to parse HTML pages and extract text from paragraphs,
- [NLTK](https://www.nltk.org/) and [TextBlob](https://textblob.readthedocs.io/en/dev/) to find all common nouns in a given text and reduce every noun to its singular form.
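
For reference, here is a condensed sketch of that pipeline, mirroring what [create_board_from_wikipedia.py](create_board_from_wikipedia.py) does (it needs the NLTK corpora downloaded by `make nltk_data`); the full program adds word-length filtering, article retries, and board generation on top of it:

```
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

# Special:Random redirects to a random Wikipedia article
page = requests.get("https://en.wikipedia.org/wiki/Special:Random", timeout=5)

# keep only the visible paragraph text
soup = BeautifulSoup(page.text, "html.parser")
text = " ".join(p.get_text() for p in soup.find_all("p"))

# collect alphabetic nouns; plural forms (NNS) are reduced to singular
nouns = set()
for word, tag in TextBlob(text).tags:
    if not word.isalpha():
        continue
    if tag == "NN":
        nouns.add(word.upper())
    elif tag == "NNS":
        nouns.add(word.singularize().upper())

print(sorted(nouns))
```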

In order to create an isolated environment with all the modules available, please ensure you have Python and [virtualenv](https://virtualenv.pypa.io/en/latest/) installed first, then run `make venv venv-examples nltk_data` from the project's root directory:

```
[2024-05-19 08:49:49] defan@defmbp wordsearch % make venv venv-examples nltk_data
tox devenv --list-dependencies .venv
[..]
Installing collected packages: urllib3, tqdm, soupsieve, regex, joblib, idna, click, charset-normalizer, certifi, requests, nltk, beautifulsoup4, textblob
Successfully installed beautifulsoup4-4.12.3 certifi-2024.2.2 charset-normalizer-3.3.2 click-8.1.7 idna-3.7 joblib-1.4.2 nltk-3.8.1 regex-2024.5.15 requests-2.31.0 soupsieve-2.5 textblob-0.18.0.post0 tqdm-4.66.4 urllib3-2.2.1
[..]
mkdir -p /Users/defan/git/wordsearch/.venv/nltk_data
NLTK_DATA="/Users/defan/git/wordsearch/.venv/nltk_data" /Users/defan/git/wordsearch/.venv/bin/python3 -m textblob.download_corpora
[nltk_data] Downloading package brown to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data] /Users/defan/git/wordsearch/.venv/nltk_data...
[nltk_data] Unzipping corpora/movie_reviews.zip.
Finished.
```

This should create a [virtualenv](https://virtualenv.pypa.io/en/latest/) (in the `.venv` directory) with all the modules and NLTK data.

# Have some fun!

Once the environment is ready, activate it by running `source ./.venv/bin/activate` in your terminal and then run the second example program (note that if you're using VSCode, it will automatically pick up the Python interpreter from the detected `.venv` virtualenv, so explicit activation may not be required):

```
[2024-05-19 08:50:22] defan@defmbp wordsearch % source ./.venv/bin/activate
(.venv) [2024-05-19 09:21:31] defan@defmbp wordsearch % ./examples/create_board_from_wikipedia.py
Bachelor of Music - Wikipedia (https://en.wikipedia.org/wiki/Bachelor_of_Music)
BANKING
CAMPUSE
COMPLETION
CONSIST
DEGREE
FRONTRUNNER
HISTORY
MUSICOLOGY
PREPARATORY
PROGRAM
RECIPIENT
STANDARD
SUBJECT
THEORY
THERAPY
K F U W N C E N Y K M M U J V K E U Y U G N E P H
V A J R E C I P I E N T K J P U O O Y Q U L N Y U
Y S Q S G O S W Z M G G X Q J W M G M G A D B S G
J K H A S R A Y T E M R X P S F L D U L H D S O I
C O N S I S T F D W G X S X Z D J B A K I T X N H
X K M E N N Z V T X J C O F I T G S Y M A X U B G
H Q C P J Z P S N T H O F T D E L M O N J V A U T
T S N L W Q R I B X W M H V V S R L D K Q N O B D
E U U Q T Y O X T V Z P U N G E A A T L K I F W Q
C B H B A F G Z H U W L L H U J R X C I L S Z R T
C J S F E P R V E T A E N I Y D Y Y N V Z M D P T
X E K M E R A O O U H T T Y D V L G H P K O Y A N
K C T Q R E M Z R D M I V I N Y V F M K K X J I R
J T Y Z C E E T Y S K O M C L D P E Q R Q M T C S
A S X F X C W T Y D K N J B C K M V X F Z I P I J
C A M P U S E Z O T Z I W B N Y O W A R D D R Y V
E I L U L Z U U H Z F F X H O W A D H O K F E V E
N Z H F N C L E K J U I H J O C D V B N W B P Z A
S E R Q U W R Z F N N I T H K E D C U T E P A K Z
H P C Z B A S C R I S F G H G Q R I O R L X R N Y
X D Q T P Q I S B T L Y U R N R X D I U O T A K R
N G S Y G E N P O Y S C E D R Z K Y Z N H M T Q W
P N A D C W I R C L S E T M H J D Y G N M Y O G N
H M M T B L Y C Y B F U L C A B B L X E D F R R C
X F X Z W T E J M U S I C O L O G Y C R N F Y C U
```
120 changes: 120 additions & 0 deletions examples/create_board_from_wikipedia.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3

"""
Example of creating a board and populating it with words from a random Wikipedia article.
"""

import sys
import random
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from wordsearch import WordsearchBoard


__author__ = "Andrei Belov"
__license__ = "MIT"
__copyright__ = f"Copyright (c) {__author__}"


WIKI_RANDOM_ARTICLE_URL = "https://en.wikipedia.org/wiki/Special:Random"

GRID_SIZE = 25
NUM_WORDS = 15
WORD_MIN_LENGTH = 6


def get_nouns_from_random_wiki_article(word_min_length=WORD_MIN_LENGTH):
    """
    Get all nouns (in singular form) from a random Wikipedia article.
    """
    try:
        page = requests.get(WIKI_RANDOM_ARTICLE_URL, timeout=5)
    except requests.exceptions.RequestException as exc:
        print(exc)
        return None

    soup = BeautifulSoup(page.text, "html.parser")
    title = soup.find("title").get_text()

    text = ""
    for para in soup.find_all("p"):
        text = text + " " + para.get_text()

    tb = TextBlob(text)

    nouns = set()
    for t in tb.tags:
        if not t[0].isalpha():
            continue

        if t[1] == "NN":
            if len(t[0]) < word_min_length:
                continue

            nouns.add(t[0].upper())
            continue

        if t[1] == "NNS":
            singular = t[0].singularize()
            if len(singular) < word_min_length:
                continue

            nouns.add(singular.upper())
            continue

    return page.url, title, nouns


def hunt_for_words(num_words=NUM_WORDS, word_min_length=WORD_MIN_LENGTH):
    """
    Go through random Wikipedia articles and search for a required number
    of words with a given minimal length.
    """
    while True:
        try:
            url, title, nouns = get_nouns_from_random_wiki_article(
                word_min_length=word_min_length
            )
        except TypeError:
            return None

        if len(nouns) < num_words:
            continue

        break

    return url, title, random.sample(list(nouns), num_words)


def main():
    """
    Entrypoint.
    """
    try:
        url, title, nouns = hunt_for_words()
    except TypeError:
        return 1

    print(f"{title} ({url})\n")

    ws = WordsearchBoard(width=GRID_SIZE, height=GRID_SIZE)

    # chances are higher if we go from longest word down to shortest
    for w in sorted(nouns, key=len, reverse=True):
        if not ws.place_word(w):
            print(f"failed to place word: {w}")
            return 1

    for w in sorted(nouns):
        print(w)

    print()
    ws.mask_grid()
    ws.print_board()

    return 0


if __name__ == "__main__":
    sys.exit(main())
6 changes: 5 additions & 1 deletion examples/place_and_discover.py
@@ -2,12 +2,16 @@
# vim:sw=4:ts=4:et:

"""
Example of creating a board, placing some words, and searching for those
Example of creating a board, placing some words, and searching for those.
"""

import sys
from wordsearch import WordsearchBoard

__author__ = "Andrei Belov"
__license__ = "MIT"
__copyright__ = f"Copyright (c) {__author__}"


def main():
"""
3 changes: 3 additions & 0 deletions examples/requirements.txt
@@ -0,0 +1,3 @@
requests>=2.31.0
beautifulsoup4>=4.12.3
textblob>=0.18
16 changes: 11 additions & 5 deletions src/wordsearch/wordsearch.py
@@ -92,18 +92,22 @@ def find_xy(self, wlen, max_attempts=1000):
:return: int x, int y, tuple direction - coordinates and direction for a word
:return: None if free cell was not found
"""
# if word can not fit at all
if self.width < wlen and self.height < wlen:
return None

if wlen <= self.height and wlen <= self.width:
direction_choices = [[1, 0], [1, 1], [0, 1], [1, -1]]

elif self.width < wlen <= self.height:
# if word is longer than board's width but can fit vertically
if self.width < wlen <= self.height:
direction_choices = [[1, 0]]

# if word is longer than board's height but can fit horizontally
elif self.height < wlen <= self.width:
direction_choices = [[0, 1]]

# if word can fit in all directions
else:
direction_choices = [[1, 0], [1, 1], [0, 1], [1, -1]]

y_spacing = self.height - wlen + 1
x_spacing = self.width - wlen + 1

@@ -115,7 +119,7 @@ def find_xy(self, wlen, max_attempts=1000):
if direction[0] == 1:
y = random.randrange(0, y_spacing) if y_spacing > 0 else 0

elif direction[0] == 0:
else: # direction[0] == 0:
y = random.randrange(0, self.height)

if direction[1] == -1:
@@ -214,6 +218,8 @@ def find(self, word, row, col, i=0, direction="all", res_board=None):

self.grid[row][col] = "*"

res = False

if direction == "all":
res = (
self.find(word, row + 1, col, i + 1, res_board=res_board)
1 change: 1 addition & 0 deletions tox.ini
@@ -25,6 +25,7 @@ description = run linters
deps =
pytest>=6
pylint>=3.1.0
-r examples/requirements.txt
commands =
pylint --reports=y src/ tests/ examples/

