-
Notifications
You must be signed in to change notification settings - Fork 100
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
57df731
commit db69e5e
Showing
114 changed files
with
3,952 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,16 +42,16 @@ def create_test_db(cur: psycopg.Cursor) -> None: | |
@pytest.fixture(scope="session", autouse=True) | ||
def set_up_test_db() -> None: | ||
# create a test user and test database owned by the test user | ||
with psycopg.connect(f"postgres://[email protected]:5432/postgres", autocommit=True) as con: | ||
with psycopg.connect("postgres://[email protected]:5432/postgres", autocommit=True) as con: | ||
with con.cursor() as cur: | ||
create_test_user(cur) | ||
create_test_db(cur) | ||
# grant some things to the test user in the test database | ||
with psycopg.connect(f"postgres://[email protected]:5432/test", autocommit=True) as con: | ||
with psycopg.connect("postgres://[email protected]:5432/test", autocommit=True) as con: | ||
with con.cursor() as cur: | ||
cur.execute("grant execute on function pg_read_binary_file(text) to test") | ||
cur.execute("grant pg_read_server_files to test") | ||
# use the test user to create the extension in the test database | ||
with psycopg.connect(f"postgres://[email protected]:5432/test") as con: | ||
with psycopg.connect("postgres://[email protected]:5432/test") as con: | ||
with con.cursor() as cur: | ||
cur.execute("create extension ai cascade") |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[pytest] | ||
python_files = test_*.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
[metadata] | ||
name = vectorizer | ||
version = attr: vectorizer.__version__ | ||
name = pgai | ||
version = attr: pgai.__version__ | ||
|
||
[options] | ||
python_requires = >=3.10 | ||
packages = vectorizer | ||
packages = pgai | ||
install_requires = file: requirements.txt | ||
|
||
[options.entry_points] | ||
console_scripts = | ||
vectorizer = vectorizer.cli:run | ||
vectorizer = pgai.cli:run |
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from dataclasses import dataclass | ||
|
||
|
||
@dataclass
class ConnInfo:
    """Connection parameters for a PostgreSQL database.

    The fields are declared in the order the generated ``__init__`` accepts
    them. ``ssl_mode`` is optional and defaults to ``"require"``.
    """

    host: str
    port: int
    role: str
    password: str
    db_name: str
    ssl_mode: str = "require"

    @property
    def url(self) -> str:
        """Return the connection URI built from this object's fields.

        The role, password and database name are percent-encoded so that
        reserved characters (``@``, ``:``, ``/``, ``?`` ...) inside any of
        them cannot change the structure of the URI. Values consisting only
        of unreserved characters yield exactly the same URI as plain string
        interpolation would.
        """
        # Imported locally so this block does not rely on a file-level import.
        from urllib.parse import quote

        # safe="" so that "/" is encoded as well (quote() leaves it by default).
        role = quote(self.role, safe="")
        password = quote(self.password, safe="")
        db_name = quote(self.db_name, safe="")
        return (
            f"postgres://{role}:{password}@{self.host}:{self.port}"
            f"/{db_name}?sslmode={self.ssl_mode}"
        )
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import asyncio | ||
import logging | ||
import os | ||
from typing import Any | ||
|
||
import structlog | ||
from pydantic import AliasChoices, Field, ValidationError | ||
from pydantic.dataclasses import dataclass | ||
|
||
from . import db | ||
from .env import get_bool_env | ||
from .processing import CloudFunctions | ||
from .secrets import Secrets | ||
from .vectorizer import Vectorizer, Worker | ||
|
||
# Absolute path of the "tiktoken_cache" directory that sits next to this module.
TIKTOKEN_CACHE_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "tiktoken_cache",
)

# Configure structlog with an INFO-level filtering logger by default;
# set_log_level() may reconfigure it later.
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
)
logger = structlog.get_logger()
|
||
|
||
@dataclass
class UpdateEmbeddings:
    """Database connection info and secrets that accompany an ``Event``."""

    db: db.ConnInfo
    secrets: Secrets
|
||
|
||
@dataclass
class Event:
    """Validated form of the raw event passed to ``lambda_handler``.

    The ``vectorizer`` field declares ``payload`` as its validation alias.
    """

    update_embeddings: UpdateEmbeddings
    vectorizer: Vectorizer = Field(validation_alias=AliasChoices("payload"))
|
||
|
||
async def run_workers(
    concurrency: int,
    conn_info: db.ConnInfo,
    vectorizer: Vectorizer,
) -> list[int]:
    """Start ``concurrency`` workers and wait until all of them are done.

    Every worker is created from the connection URL and the vectorizer, and
    its ``run()`` coroutine is scheduled as a separate asyncio task.

    Returns:
        The results of the workers' ``run()`` calls, in the order the
        workers were started.
    """
    # TODO: handle timeout so that lambdas are not killed by AWS
    pending = []
    for _ in range(concurrency):
        worker = Worker(conn_info.url, vectorizer)
        pending.append(asyncio.create_task(worker.run()))
    return await asyncio.gather(*pending)
|
||
|
||
def set_log_level(cf: CloudFunctions):
    """Reconfigure structlog's filtering level from ``cf.log_level``.

    Nothing is changed when the requested level is ``"INFO"`` or when it is
    not one of the level names known to the ``logging`` module.
    """
    known_levels = logging.getLevelNamesMapping()
    requested = cf.log_level
    if requested == "INFO":
        return
    if requested not in known_levels:
        return
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(known_levels[requested])
    )
|
||
|
||
def lambda_handler(raw_event: dict[str, Any], _: Any) -> dict[str, int]:
    """Lambda entry point. Validates the config given via the event, and
    starts the embedding tasks.

    Args:
        raw_event (dict): maps to the `Event` dataclass.
        _: second handler argument; not used.

    Returns:
        A dict with ``statusCode`` set to 200 and ``processed_tasks`` set to
        the sum of the values returned by the workers.

    Raises:
        ValidationError: if `raw_event` does not validate as an `Event`.
    """
    # A ValidationError raised here propagates to the caller as-is; wrapping
    # the call in try/except only to re-raise the same exception adds nothing.
    event = Event(**raw_event)
    config = event.vectorizer.config

    # The type error we are ignoring is because there's only one type available
    # for Config.processing. We keep the check to signal intent, in case we add
    # other types in the future.
    if isinstance(config.processing, CloudFunctions):  # type: ignore
        set_log_level(config.processing)

    config.embedding.set_api_key(event.update_embeddings.secrets)

    # Set the tiktoken cache directory env var before the workers start.
    os.environ["TIKTOKEN_CACHE_DIR"] = TIKTOKEN_CACHE_DIR
    results = asyncio.run(
        run_workers(
            config.processing.concurrency,
            event.update_embeddings.db,
            event.vectorizer,
        )
    )
    return {"statusCode": 200, "processed_tasks": sum(results)}
File renamed without changes.
File renamed without changes.
Oops, something went wrong.