Merge pull request #19 from small-thinking/add-notion-query
Add query page function
yxjiang authored Feb 13, 2024
2 parents 36a65ec + 92e41f8 commit 34c85f7
Showing 10 changed files with 207 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test-build.yaml
@@ -26,7 +26,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.11
python-version: '3.10'

- name: Install dependencies
run: |
@@ -43,7 +43,7 @@ jobs:
- name: Install lint dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 mypy==1.1.1 isort black
pip install flake8 mypy==1.1.1 isort black==23.12.0
- name: Run black
run: |
black --check .
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -30,7 +30,7 @@ pymemgpt = "^0.2.7"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.3"
black = "^23.11.0"
black = "23.12.0"
flake8 = "^6.1.0"
isort = "^5.12.0"
mypy = "^1.7.1"
91 changes: 73 additions & 18 deletions wenling/archiver.py
@@ -1,16 +1,17 @@
"""
"""
import asyncio

import json
import os
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List

from bs4 import BeautifulSoup, Tag
from dotenv import load_dotenv # type: ignore

from wenling.common.model_utils import OpenAIChatModel
from wenling.common.notion_utils import NotionStorage
from wenling.common.notion_storage import NotionStorage
from wenling.common.utils import *


@@ -27,26 +28,26 @@ def __init__(self, verbose: bool = False):
]
self.default_archiver = WebPageArchiver(verbose=verbose)

async def archive(self, url: str) -> str:
async def archive(self, url: str, notes: Optional[str] = None) -> str:
"""Match the url with pattern and find the corresponding archiver."""
for archiver in self.archivers:
if re.match(pattern=archiver["match_regex"], string=url):
if self.verbose:
self.logger.info(f"Archive url with archiver {archiver['archiver'].name}...")
page_id = await archiver["archiver"].archive(url)
self.logger.info(f"Archive url with archiver {archiver['archiver'].name} with notes {notes}...")
page_id = await archiver["archiver"].archive(url=url, notes=notes)
return page_id
# Match to general web archiver by default.
if self.verbose:
self.logger.info(f"Archive url with archiver general web archiver...")
page_id = await self.default_archiver.archive(url)
page_id = await self.default_archiver.archive(url=url, notes=notes)
return page_id


class Archiver(ABC):
"""Archiver is a tool used to archive the bookmarked articles."""

def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kewargs):
load_env()
def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kwargs):
load_dotenv(override=True)
self.api_key = os.getenv("ARCHIVER_API_KEY")
self.verbose = verbose
self.logger = Logger(logger_name=os.path.basename(__file__), verbose=verbose)
@@ -61,6 +62,40 @@ def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kewargs
def _extra_setup(self):
pass

async def _auto_tagging(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
"""Leverage the LLM to auto-generate the tags based on the contents."""
contents_str = "\n".join([paragraph.get("text", "") for paragraph in paragraphs])
prompt = f"""
Please help generate the tags based on the contents below:
---
{contents_str}
---
Some suggested tags:
1. If this article is about building agent, please add the tag Agent.
2. If this article is about LLM, please add the tag LLM.
3. If this article is about deep learning in general, please add the tag Deep Learning.
4. If this article is about tech philosophy, please add the tag Tech Philosophy.
5. Please use any other tags that you think are relevant.
Please generate the tags in the same language as the contents, and return in below json format:
{{
"tags": ["tag1", "tag2", "tag3"]
}}
"""
json_response_str = self.model.inference(
user_prompt=prompt, max_tokens=256, temperature=0.0, response_format="json_object"
)
try:
json_obj = json.loads(json_response_str)
tags = json_obj.get("tags", [])
if self.verbose:
self.logger.info(f"Auto-generated tags: {tags}")
return tags
except Exception as e:
self.logger.error(f"Error parsing the tags. Details: {str(e)}. Return empty tags.")
return []

@abstractmethod
def _set_name(self) -> str:
pass
@@ -83,14 +118,14 @@ def _consolidate_content(self, content: List[Dict[str, Any]]) -> List[Dict[str,
consolidated_content.append(block)
return consolidated_content

async def archive(self, url: str) -> str:
async def archive(self, url: str, notes: Optional[str] = None) -> str:
if not check_url_exists(url):
raise ValueError(f"The url {url} does not exist.")
article_json_obj = await self._archive(url)
article_json_obj = await self._archive(url=url, notes=notes)
return await self.notion_store.store(json_obj=article_json_obj)

@abstractmethod
async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
pass

def list_archived(self) -> List[str]:
@@ -123,10 +158,13 @@ def _parse_author(self, element_bs: BeautifulSoup) -> str:
"""Get the author name from a sub element with class "rich_media_meta rich_media_meta_text",
put it into {"type": "h2", "text": <author_name>}.
"""
author = ""
author_element = element_bs.select_one(".rich_media_meta.rich_media_meta_text")
if not author_element:
raise ValueError("Cannot find author element.")
author = author_element.get_text().strip()
if self.verbose:
self.logger.warning("Cannot find author element.")
else:
author = author_element.get_text().strip()
return author

def _parse_publish_time(self, element_bs: BeautifulSoup) -> Dict[str, str]:
@@ -143,10 +181,13 @@ def _parse_tags(self, element_bs: BeautifulSoup) -> List[str]:
"""Get the tags from a sub elements (not direct sub) each with class "article-tag__item",
and put them into {"type": "text", "text": <comma separated tags>}
"""
tags = []
tags_element = element_bs.select(".article-tag__item")
if not tags_element:
raise ValueError("Cannot find tags element.")
tags = [tag.get_text().strip() for tag in tags_element]
if self.verbose:
self.logger.warning("Cannot find tags element.")
else:
tags = [tag.get_text().strip() for tag in tags_element]
return tags

def _parse_paragraph(self, paragraph_tag: Tag, cache: Dict[str, Any]) -> List[Dict[str, str]]:
@@ -271,7 +312,7 @@ def _parse_content(self, element_bs: BeautifulSoup) -> List[Dict[str, Any]]:

return content_json_obj

async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
"""Get the content block from the web page with the path div#img-content.rich_media_wrp.
Parse the elements and put them into a json object with list of elements.
"""
@@ -288,16 +329,26 @@ async def _archive(self, url: str) -> Dict[str, Any]:
"children": paragraphs,
}
article_json_obj["properties"]["url"] = url
article_json_obj["properties"]["notes"] = notes if notes else ""
article_json_obj["properties"]["title"] = self._parse_title(element_bs=element_bs)
article_json_obj["properties"]["type"] = "微信"
# Convert date time to needed format.
article_json_obj["properties"]["datetime"] = get_datetime()
tags = self._parse_tags(element_bs=element_bs) + [self._parse_author(element_bs=element_bs)]
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
if tags:
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
else:
tags = []
self.logger.info(f"Auto-generate tags based on the contents...")
auto_tags = await self._auto_tagging(paragraphs)
tags.extend(auto_tags)
article_json_obj["properties"]["tags"] = tags

if self.verbose:
json_object_str = json.dumps(article_json_obj, indent=2)
self.logger.info(f"Archived article: {json_object_str}")
except Exception as e:
self.logger.error(f"Error parsing content. Details: {str(e)}")
raise ValueError(f"Error parsing content. Details: {str(e)}")
finally:
return article_json_obj
@@ -468,7 +519,7 @@ async def _parse_tags(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
self.logger.error(f"Error parsing the tags. Details: {str(e)}. Return empty tags.")
return []

async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
"""Get the content block from the web page.
Parse the elements and put them into a json object with list of elements.
"""
@@ -484,6 +535,7 @@ async def _archive(self, url: str) -> Dict[str, Any]:
}
try:
article_json_obj["properties"]["url"] = url
article_json_obj["properties"]["notes"] = notes if notes else ""
article_json_obj["properties"]["title"] = self._parse_title(element_bs=element_bs)
article_json_obj["properties"]["type"] = "网页"
article_json_obj["properties"]["datetime"] = get_datetime()
Expand All @@ -492,6 +544,9 @@ async def _archive(self, url: str) -> Dict[str, Any]:
# Leverage LLM to generate the tags based on the article json obj contents.
tags = await self._parse_tags(contents)
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
self.logger.info(f"Auto-generate tags based on the contents...")
auto_tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
tags += auto_tags
article_json_obj["properties"]["tags"] = tags
if self.verbose:
json_object_str = json.dumps(article_json_obj, indent=2)
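For orientation, here is a minimal usage sketch of the updated archiver entry point. The orchestrator class name, module path, and URL are assumptions for illustration; only the archive(url, notes) signature and the verbose flag appear in this diff.

import asyncio

from wenling.archiver import ArchiverOrchestrator  # class name assumed; not shown in this diff


async def main() -> None:
    orchestrator = ArchiverOrchestrator(verbose=True)
    # The new `notes` argument is forwarded to whichever archiver matches the
    # URL and ends up in the page's "Notes" property (see notion_storage.py below).
    page_id = await orchestrator.archive(
        url="https://example.com/some-article",  # hypothetical URL
        notes="Saved for the reading list",
    )
    print(f"Archived as Notion page {page_id}")


asyncio.run(main())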
5 changes: 3 additions & 2 deletions wenling/common/image_upload.py
@@ -10,9 +10,10 @@
import flickrapi
import requests
import retrying
from dotenv import load_dotenv # type: ignore
from flickrapi.auth import FlickrAccessToken

from wenling.common.utils import Logger, load_env
from wenling.common.utils import Logger


async def upload_image_to_flickr(
@@ -34,7 +35,7 @@ async def upload_image_to_flickr(
Raises:
- Exception: If there is an error in uploading or parsing the response.
"""
load_env()
load_dotenv(override=True)

api_key = os.environ.get("FLICKR_API_KEY")
api_secret = os.environ.get("FLICKR_SECRET")
5 changes: 3 additions & 2 deletions wenling/common/model_utils.py
@@ -3,8 +3,9 @@

import openai
import retrying
from dotenv import load_dotenv # type: ignore

from wenling.common.utils import Logger, load_env
from wenling.common.utils import Logger


class Model(ABC):
@@ -33,7 +34,7 @@ class OpenAIChatModel(Model):

def __init__(self, *args, **kwargs):
super().__init__(vendor_type="openai", *args, **kwargs)
load_env()
load_dotenv(override=True)
self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

@retrying.retry(stop_max_attempt_number=3)
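Both image_upload.py and model_utils.py swap the project's load_env() helper for python-dotenv's load_dotenv(override=True). With override=True, values defined in .env replace variables already present in the process environment, which the default override=False would leave untouched. A minimal sketch of that difference:

import os

from dotenv import load_dotenv

os.environ["OPENAI_API_KEY"] = "stale-value"
load_dotenv(override=True)  # a key defined in .env now replaces the pre-set value
print(os.environ["OPENAI_API_KEY"])  # prints the .env value, assuming the key is defined there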
88 changes: 88 additions & 0 deletions wenling/common/notion_query.py
@@ -0,0 +1,88 @@
import asyncio
import datetime
import os
from typing import Any, Dict, List, Optional, Tuple

from notion_client import AsyncClient

from wenling.common.utils import Logger


class NotionQuery:
def __init__(self, database_id: str, verbose: bool = False):
self.database_id = database_id
self.verbose = verbose
self.logger = Logger(logger_name=os.path.basename(__file__), verbose=verbose)
self.token = os.environ.get("NOTION_TOKEN")
if not self.token:
raise ValueError("Please set the Notion token in .env.")
self.notion = AsyncClient(auth=self.token)

async def query_pages(self, start_date: str, end_date: str, tags: Optional[List[str]] = None) -> List[Any]:
"""
Query the database with given date range and tags.
The start and end dates that represent the range can be the same date, or a range of dates.
The tags are optional, and can be empty.
Args:
start_date (str): The start date of the range.
end_date (str): The end date of the range.
tags (Optional[List[str]]): Optional list of tags to filter the results. Defaults to None.
Returns:
List[str]: A list of page IDs that match the query.
"""

# Construct the filter conditions based on the start and end dates
filter_conditions = []
if start_date:
filter_conditions.append({"property": "Archive Date", "date": {"on_or_after": start_date}})
if end_date:
filter_conditions.append({"property": "Archive Date", "date": {"on_or_before": end_date}})

# Add filter conditions for tags if provided
if tags:
for tag in tags:
filter_conditions.append({"property": "Tags", "multi_select": {"contains": tag}})

# Perform the query using the constructed filter conditions
try:
if self.verbose:
self.logger.info(f"Querying Notion database with filter conditions: {filter_conditions}")
results = await self.notion.databases.query(
database_id=self.database_id, filter={"and": filter_conditions} if filter_conditions else None
)
if not results:
self.logger.info("No results found.")
return []
# Extract the page_ids from the results.
page_ids = [page.get("id") for page in results.get("results")]
if self.verbose:
self.logger.info(f"Retrieved {len(page_ids)} results.")
return page_ids
except Exception as e:
self.logger.error(f"Error querying Notion database: {e}")
return []

async def query_page_contents(self, page_id: str) -> Tuple[str, str, List[str]]:
"""
Query the contents of the page with the given page_id.
Args:
page_id (str): The ID of the page to query.
Returns:
Tuple[str, str, List[str]]: A tuple containing the URL, title, and tags of the page.
"""
# Query the contents of the page with the given page_id
page = await self.notion.pages.retrieve(page_id=page_id)
if self.verbose:
self.logger.info(f"Retrieved page: {page}")
# Extract the URL and title properties from the page
url = page.get("properties").get("URL").get("rich_text")[0].get("text")
title = page.get("properties").get("Title").get("title")[0].get("text").get("content")
tags_blob = page.get("properties").get("Tags").get("multi_select")
tags = [tag.get("name") for tag in tags_blob]
# Return the URL and title as a tuple
return url, title, tags
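A minimal sketch of driving the new query API end to end. The NOTION_DATABASE_ID variable name and the example date and tag are assumptions; NotionQuery itself only requires NOTION_TOKEN to be set.

import asyncio
import os

from dotenv import load_dotenv

from wenling.common.notion_query import NotionQuery


async def main() -> None:
    load_dotenv(override=True)  # NotionQuery reads NOTION_TOKEN from the environment
    query = NotionQuery(database_id=os.environ["NOTION_DATABASE_ID"], verbose=True)
    # start_date and end_date may be the same day; tag filters are ANDed together.
    page_ids = await query.query_pages(start_date="2024-02-13", end_date="2024-02-13", tags=["LLM"])
    for page_id in page_ids:
        url, title, tags = await query.query_page_contents(page_id)
        print(title, url, tags)


asyncio.run(main())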
wenling/common/notion_storage.py
@@ -1,5 +1,6 @@
"""Store the data in Notion.
"""

import asyncio
import os
from typing import Any, Dict, List
@@ -173,6 +174,7 @@ async def _add_to_database(self, database_id: str, json_obj: Dict[str, Any]) ->
"Archive Date": {"start": properties["datetime"]},
"Tags": [{"name": tag} for tag in properties.get("tags", [])],
"Status": {"name": properties.get("status", "Archived")},
"Notes": [{"type": "text", "text": {"content": properties.get("notes", "")}}],
"URL": [{"type": "text", "text": {"content": properties.get("url", "")}}],
}
children = await self._create_page_blocks(json_obj["children"])
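For reference, a sketch of the intermediate JSON object that archivers hand to NotionStorage.store(), including the notes property this PR threads through. The property names come from the hunks above; all values are illustrative.

article_json_obj = {
    "properties": {
        "url": "https://example.com/some-article",  # hypothetical
        "notes": "Saved for the reading list",  # new in this PR
        "title": "Some Article",
        "type": "网页",
        "datetime": "2024-02-13",
        "tags": ["LLM", "Agent"],
        "status": "Archived",  # default used by _add_to_database
    },
    "children": [
        {"type": "h2", "text": "Author Name"},
        {"type": "text", "text": "First paragraph of the article..."},
    ],
}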