Skip to content

Commit

Permalink
Merge pull request #8 from samhaswon/optimization
Browse files Browse the repository at this point in the history
Majority C rewrite
  • Loading branch information
samhaswon authored May 15, 2024
2 parents 5434c61 + e60e335 commit 4b01036
Show file tree
Hide file tree
Showing 53 changed files with 1,671 additions and 476 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ dist/
/src/multi_bible_search.egg-info/
*.pyc
/src/multi_bible_search/C/
/src/multi_bible_search/*.pyd
/src/multi_bible_search/*.so
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ If a query is made with no matches, say "notawordinthebible," the result of the

Versions are automatically loaded as needed, but you may wish to preload a version for the sake of speed.

An index of a particular version can be preloaded with the `load()` method. Simply pPass the version identifier as a string to load it.
An index of a particular version can be preloaded with the `load()` method. Simply pass the version identifier as a string to load it.

Remember that capitalization is important when using this method.

You may also preload all versions by simply calling the `load_all()` method.

Finally, you can unload a particular version with the `unload_version()` method.

## Supported Versions

Supported versions can be listed with
Expand Down
41 changes: 39 additions & 2 deletions build_index/build_bible_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ def remove_punctuation(input_string: str) -> str:
:param input_string: String to process
:return: cleaned string
"""
return ''.join(x for x in re.sub(r"[\u0080-\uffef]s?|'s?", "", input_string)
return ''.join(x for x in re.sub(r"\s\s+", " ",
re.sub(
r"[\u0080-\uffef](s?(?=\W))|['\"]s?|[,.:;\-?!]+|\u2014|\\ul\d",
" ", input_string.replace("\u014D", "o")))
if (x.isalpha() or x.isspace()))


Expand All @@ -24,7 +27,7 @@ def tokenize(input_string: str) -> List[str]:
:param input_string: The string to tokenize.
:return: A list of lowercase tokens (words) in the string.
"""
return remove_punctuation(input_string.lower()).split()
return remove_punctuation(input_string.replace("I-chabod", "Ichabod").lower()).split()


def index_bible(bible, name: str, result) -> None:
Expand Down Expand Up @@ -56,6 +59,30 @@ def index_bible(bible, name: str, result) -> None:
tokens = list(set(tokenize(passage[passage.find(" "):])))
reference = translate(book, chapter, verse)
for token in tokens:
if len(token) == 1 and token != "a":
continue
# XML source issue
elif token in {"lxx", "syr", "vg", "heb", "etc", "isha", "ish", "aleph", "kol"}:
continue
# ????
elif token == 'ij':
token = "i"
# Typo in the source?
elif token == "ad":
token = "and"
# Make this the same
elif token == 'aramnaharaim':
if "aram" not in tmp_index:
tmp_index["aram"] = []
if "naharaim" not in tmp_index:
tmp_index["naharaim"] = []
tmp_index["aram"].append(reference)
tmp_index["naharaim"].append(reference)
continue
try:
a = token.encode("ascii")
except:
continue
if token not in tmp_index:
tmp_index[token] = []
tmp_index[token].append(reference)
Expand Down Expand Up @@ -124,6 +151,10 @@ def make_index(bibles: dict) -> dict:
kjv_like = ["AKJV", "GNV", "KJV", "KJV 1611", "RNKJV", "UKJV"]
index = separate_duplicates(index, kjv_like, "KJV-like")

print("Built tertiary index. removing duplicates from the two NIV versions...")
both_niv = ["NIV 1984", "NIV 2011"]
index = separate_duplicates(index, both_niv, "NIV")

return index


Expand Down Expand Up @@ -175,9 +206,15 @@ def save(data: dict, key: str) -> None:

reference_index = make_index(bibles)

key_list = []
# Save each version's index as a separate file to be able to load them independently.
for key in reference_index.keys():
save(reference_index[key], key)
key_list.extend(reference_index[key].keys())

key_list = list(set(key_list))
with open("../keys.txt", "w", encoding="utf-8") as key_file:
key_file.write(''.join(f"{k}\n" for k in key_list))
# How long did this take? Because this takes a while to run.
end = time.perf_counter()
print(f"Index time: {end - start:.4f}")
27 changes: 27 additions & 0 deletions debug_scripts/fully_unique.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from src.multi_bible_search import BibleSearch
import faulthandler


# Debug script: report every token from version_keys.txt that produces search
# results in fewer than two of the loaded versions.

# faulthandler dumps the Python traceback if the interpreter crashes hard.
# NOTE(review): presumably enabled because the search backend is a C
# extension -- confirm.
faulthandler.enable()
searcher = BibleSearch()
searcher.load_all()

# The token file holds one token per line.
with open("version_keys.txt", "r") as token_file:
    words = token_file.read().splitlines()

rare_words = []
for word in words:
    # Query every version (no early exit) and keep the ones with any result.
    versions_with_hits = [
        version
        for version in searcher.versions
        if len(searcher.search(word, version))
    ]
    if len(versions_with_hits) < 2:
        rare_words.append(word)

if rare_words:
    listing = ''.join(f"{t}, " for t in rare_words)
    print("Unique tokens:\n", listing)
else:
    print("No unique tokens")
10 changes: 10 additions & 0 deletions debug_scripts/get_keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import bz2
import json

# Debug script: dump every key of the NIV 2011 index, sorted, into
# version_keys.txt with one key per line.

# The path is a plain string; it has no placeholders, so no f-string is needed.
with bz2.open("../src/multi_bible_search/data/NIV 2011.json.pbz2", "rt", encoding="utf-8") as data_file:
    # Assumes the top-level JSON value is an object (dict) keyed by token.
    data = json.load(data_file)

# Write with an explicit encoding so the output does not depend on the
# platform's locale default; the input above is read as UTF-8.
with open("version_keys.txt", "w", encoding="utf-8") as key_file:
    keys = sorted(data)
    key_file.writelines(f"{k}\n" for k in keys)
12 changes: 12 additions & 0 deletions debug_scripts/key_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from src.multi_bible_search import BibleSearch

# Debug script: run one query against every loaded version and print the
# versions that return at least one result.

# Query string passed unchanged to BibleSearch.search().
token = "jesus wept"

searcher = BibleSearch()
searcher.load_all()

for version in searcher.versions:
    result = searcher.search(token, version)
    # Stay silent for versions with no results.
    if len(result) == 0:
        continue
    print(f"Version: {version}; Results {result}")
8 changes: 5 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
build:
./venv/Scripts/activate
py -m build

install:
pip install --force-reinstall ./dist/multi_bible_search-1.0.0.tar.gz
install: dist/multi_bible_search-2.0.0.tar.gz
pip install --force-reinstall ./dist/multi_bible_search-2.0.0.tar.gz
copy venv\\Lib\\site-packages\\multi_bible_search\\*.pyd src\\multi_bible_search\\

full: build install
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "multi_bible_search"
version = "1.0.1"
version = "2.0.0"
authors = [
{ name="Samuel Howard" },
]
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ inplace = True

[ext_modules]
sources =
ctranslate.c
multi_bible_search.c
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
setup(
name='multi_bible_search',
ext_modules=[
Extension('ctranslate.ctranslate', ['src/ctranslate/ctranslate.c']),
Extension('multi_bible_search.multi_bible_search',
['src/multi_bible_search/multi_bible_search.c'],
include_dirs=['src/multi_bible_search/'],
)
],
)
Empty file removed src/ctranslate/__init__.py
Empty file.
Loading

0 comments on commit 4b01036

Please sign in to comment.