Merge pull request #19 from small-thinking/add-notion-query
Add query page function
yxjiang authored Feb 13, 2024
2 parents 36a65ec + 92e41f8 commit 34c85f7
Showing 10 changed files with 207 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test-build.yaml
@@ -26,7 +26,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.11
python-version: '3.10'

- name: Install dependencies
run: |
@@ -43,7 +43,7 @@ jobs:
- name: Install lint dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 mypy==1.1.1 isort black
pip install flake8 mypy==1.1.1 isort black==23.12.0
- name: Run black
run: |
black --check .
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -30,7 +30,7 @@ pymemgpt = "^0.2.7"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.3"
black = "^23.11.0"
black = "23.12.0"
flake8 = "^6.1.0"
isort = "^5.12.0"
mypy = "^1.7.1"
91 changes: 73 additions & 18 deletions wenling/archiver.py
@@ -1,16 +1,17 @@
"""
"""
import asyncio

import json
import os
import re
from abc import ABC, abstractmethod
from typing import Any, Dict, List

from bs4 import BeautifulSoup, Tag
from dotenv import load_dotenv # type: ignore

from wenling.common.model_utils import OpenAIChatModel
from wenling.common.notion_utils import NotionStorage
from wenling.common.notion_storage import NotionStorage
from wenling.common.utils import *


@@ -27,26 +28,26 @@ def __init__(self, verbose: bool = False):
]
self.default_archiver = WebPageArchiver(verbose=verbose)

async def archive(self, url: str) -> str:
async def archive(self, url: str, notes: Optional[str] = None) -> str:
"""Match the url with pattern and find the corresponding archiver."""
for archiver in self.archivers:
if re.match(pattern=archiver["match_regex"], string=url):
if self.verbose:
self.logger.info(f"Archive url with archiver {archiver['archiver'].name}...")
page_id = await archiver["archiver"].archive(url)
self.logger.info(f"Archive url with archiver {archiver['archiver'].name} with notes {notes}...")
page_id = await archiver["archiver"].archive(url=url, notes=notes)
return page_id
# Match to general web archiver by default.
if self.verbose:
self.logger.info(f"Archive url with archiver general web archiver...")
page_id = await self.default_archiver.archive(url)
page_id = await self.default_archiver.archive(url=url, notes=notes)
return page_id


class Archiver(ABC):
"""Archiver is a tool used to archive the bookmarked articles."""

def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kewargs):
load_env()
def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kwargs):
load_dotenv(override=True)
self.api_key = os.getenv("ARCHIVER_API_KEY")
self.verbose = verbose
self.logger = Logger(logger_name=os.path.basename(__file__), verbose=verbose)
@@ -61,6 +62,40 @@ def __init__(self, vendor_type: str = "openai", verbose: bool = False, **kewargs
def _extra_setup(self):
pass

async def _auto_tagging(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
"""Leverage the LLM to auto-generate the tags based on the contents."""
contents_str = "\n".join([paragraph.get("text", "") for paragraph in paragraphs])
prompt = f"""
Please help generate the tags based on the contents below:
---
{contents_str}
---
Some suggested tags:
1. If this article is about building agent, please add the tag Agent.
2. If this article is about LLM, please add the tag LLM.
3. If this article is about deep learning in general, please add the tag Deep Learning.
4. If this article is about tech philosophy, please add the tag Tech Philosophy.
5. Please use any other tags that you think are relevant.
Please generate the tags in the same language as the contents, and return in below json format:
{{
"tags": ["tag1", "tag2", "tag3"]
}}
"""
json_response_str = self.model.inference(
user_prompt=prompt, max_tokens=256, temperature=0.0, response_format="json_object"
)
try:
json_obj = json.loads(json_response_str)
tags = json_obj.get("tags", [])
if self.verbose:
self.logger.info(f"Auto-generated tags: {tags}")
return tags
except Exception as e:
self.logger.error(f"Error parsing the tags. Details: {str(e)}. Return empty tags.")
return []

@abstractmethod
def _set_name(self) -> str:
pass
@@ -83,14 +118,14 @@ def _consolidate_content(self, content: List[Dict[str, Any]]) -> List[Dict[str,
consolidated_content.append(block)
return consolidated_content

async def archive(self, url: str) -> str:
async def archive(self, url: str, notes: Optional[str] = None) -> str:
if not check_url_exists(url):
raise ValueError(f"The url {url} does not exist.")
article_json_obj = await self._archive(url)
article_json_obj = await self._archive(url=url, notes=notes)
return await self.notion_store.store(json_obj=article_json_obj)

@abstractmethod
async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
pass

def list_archived(self) -> List[str]:
@@ -123,10 +158,13 @@ def _parse_author(self, element_bs: BeautifulSoup) -> str:
"""Get the author name from a sub element with class "rich_media_meta rich_media_meta_text",
put it into {"type": "h2", "text": <author_name>}.
"""
author = ""
author_element = element_bs.select_one(".rich_media_meta.rich_media_meta_text")
if not author_element:
raise ValueError("Cannot find author element.")
author = author_element.get_text().strip()
if self.verbose:
self.logger.warning("Cannot find author element.")
else:
author = author_element.get_text().strip()
return author

def _parse_publish_time(self, element_bs: BeautifulSoup) -> Dict[str, str]:
@@ -143,10 +181,13 @@ def _parse_tags(self, element_bs: BeautifulSoup) -> List[str]:
"""Get the tags from a sub elements (not direct sub) each with class "article-tag__item",
and put them into {"type": "text", "text": <comma separated tags>}
"""
tags = []
tags_element = element_bs.select(".article-tag__item")
if not tags_element:
raise ValueError("Cannot find tags element.")
tags = [tag.get_text().strip() for tag in tags_element]
if self.verbose:
self.logger.warning("Cannot find tags element.")
else:
tags = [tag.get_text().strip() for tag in tags_element]
return tags

def _parse_paragraph(self, paragraph_tag: Tag, cache: Dict[str, Any]) -> List[Dict[str, str]]:
@@ -271,7 +312,7 @@ def _parse_content(self, element_bs: BeautifulSoup) -> List[Dict[str, Any]]:

return content_json_obj

async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
"""Get the content block from the web page with the path div#img-content.rich_media_wrp.
Parse the elements and put them into a json object with list of elements.
"""
@@ -288,16 +329,26 @@ async def _archive(self, url: str) -> Dict[str, Any]:
"children": paragraphs,
}
article_json_obj["properties"]["url"] = url
article_json_obj["properties"]["notes"] = notes if notes else ""
article_json_obj["properties"]["title"] = self._parse_title(element_bs=element_bs)
article_json_obj["properties"]["type"] = "微信"
# Convert date time to needed format.
article_json_obj["properties"]["datetime"] = get_datetime()
tags = self._parse_tags(element_bs=element_bs) + [self._parse_author(element_bs=element_bs)]
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
if tags:
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
else:
tags = []
self.logger.info(f"Auto-generate tags based on the contents...")
auto_tags = await self._auto_tagging(paragraphs)
tags.extend(auto_tags)
article_json_obj["properties"]["tags"] = tags

if self.verbose:
json_object_str = json.dumps(article_json_obj, indent=2)
self.logger.info(f"Archived article: {json_object_str}")
except Exception as e:
self.logger.error(f"Error parsing content. Details: {str(e)}")
raise ValueError(f"Error parsing content. Details: {str(e)}")
finally:
return article_json_obj
@@ -468,7 +519,7 @@ async def _parse_tags(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
self.logger.error(f"Error parsing the tags. Details: {str(e)}. Return empty tags.")
return []

async def _archive(self, url: str) -> Dict[str, Any]:
async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any]:
"""Get the content block from the web page.
Parse the elements and put them into a json object with list of elements.
"""
@@ -484,6 +535,7 @@ async def _archive(self, url: str) -> Dict[str, Any]:
}
try:
article_json_obj["properties"]["url"] = url
article_json_obj["properties"]["notes"] = notes if notes else ""
article_json_obj["properties"]["title"] = self._parse_title(element_bs=element_bs)
article_json_obj["properties"]["type"] = "网页"
article_json_obj["properties"]["datetime"] = get_datetime()
Expand All @@ -492,6 +544,9 @@ async def _archive(self, url: str) -> Dict[str, Any]:
# Leverage LLM to generate the tags based on the article json obj contents.
tags = await self._parse_tags(contents)
tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
self.logger.info(f"Auto-generate tags based on the contents...")
auto_tags = [tag.replace("#", "") for tag in tags if len(tag) > 1]
tags += auto_tags
article_json_obj["properties"]["tags"] = tags
if self.verbose:
json_object_str = json.dumps(article_json_obj, indent=2)
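For orientation, here is a minimal usage sketch of the updated archiver entry point. The orchestrator class name, module path, and URL are assumptions for illustration; only the archive(url, notes) signature and the verbose flag appear in this diff.

import asyncio

from wenling.archiver import ArchiverOrchestrator  # class name assumed; not shown in this diff


async def main() -> None:
    orchestrator = ArchiverOrchestrator(verbose=True)
    # The new `notes` argument is forwarded to whichever archiver matches the
    # URL and ends up in the page's "Notes" property (see notion_storage.py below).
    page_id = await orchestrator.archive(
        url="https://example.com/some-article",  # hypothetical URL
        notes="Saved for the reading list",
    )
    print(f"Archived as Notion page {page_id}")


asyncio.run(main())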
5 changes: 3 additions & 2 deletions wenling/common/image_upload.py
@@ -10,9 +10,10 @@
import flickrapi
import requests
import retrying
from dotenv import load_dotenv # type: ignore
from flickrapi.auth import FlickrAccessToken

from wenling.common.utils import Logger, load_env
from wenling.common.utils import Logger


async def upload_image_to_flickr(
@@ -34,7 +35,7 @@ async def upload_image_to_flickr(
Raises:
- Exception: If there is an error in uploading or parsing the response.
"""
load_env()
load_dotenv(override=True)

api_key = os.environ.get("FLICKR_API_KEY")
api_secret = os.environ.get("FLICKR_SECRET")
5 changes: 3 additions & 2 deletions wenling/common/model_utils.py
@@ -3,8 +3,9 @@

import openai
import retrying
from dotenv import load_dotenv # type: ignore

from wenling.common.utils import Logger, load_env
from wenling.common.utils import Logger


class Model(ABC):
@@ -33,7 +34,7 @@ class OpenAIChatModel(Model):

def __init__(self, *args, **kwargs):
super().__init__(vendor_type="openai", *args, **kwargs)
load_env()
load_dotenv(override=True)
self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

@retrying.retry(stop_max_attempt_number=3)
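Both image_upload.py and model_utils.py swap the project's load_env() helper for python-dotenv's load_dotenv(override=True). With override=True, values defined in .env replace variables already present in the process environment, which the default override=False would leave untouched. A minimal sketch of that difference:

import os

from dotenv import load_dotenv

os.environ["OPENAI_API_KEY"] = "stale-value"
load_dotenv(override=True)  # a key defined in .env now replaces the pre-set value
print(os.environ["OPENAI_API_KEY"])  # prints the .env value, assuming the key is defined there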
88 changes: 88 additions & 0 deletions wenling/common/notion_query.py
@@ -0,0 +1,88 @@
import asyncio
import datetime
import os
from typing import Any, Dict, List, Optional, Tuple

from notion_client import AsyncClient

from wenling.common.utils import Logger


class NotionQuery:
def __init__(self, database_id: str, verbose: bool = False):
self.database_id = database_id
self.verbose = verbose
self.logger = Logger(logger_name=os.path.basename(__file__), verbose=verbose)
self.token = os.environ.get("NOTION_TOKEN")
if not self.token:
raise ValueError("Please set the Notion token in .env.")
self.notion = AsyncClient(auth=self.token)

async def query_pages(self, start_date: str, end_date: str, tags: Optional[List[str]] = None) -> List[Any]:
"""
Query the database with given date range and tags.
The start and end dates that represent the range can be the same date, or a range of dates.
The tags are optional, and can be empty.
Args:
start_date (str): The start date of the range.
end_date (str): The end date of the range.
tags (Optional[List[str]]): Optional list of tags to filter the results. Defaults to None.
Returns:
List[str]: A list of page IDs that match the query.
"""

# Construct the filter conditions based on the start and end dates
filter_conditions = []
if start_date:
filter_conditions.append({"property": "Archive Date", "date": {"on_or_after": start_date}})
if end_date:
filter_conditions.append({"property": "Archive Date", "date": {"on_or_before": end_date}})

# Add filter conditions for tags if provided
if tags:
for tag in tags:
filter_conditions.append({"property": "Tags", "multi_select": {"contains": tag}})

# Perform the query using the constructed filter conditions
try:
if self.verbose:
self.logger.info(f"Querying Notion database with filter conditions: {filter_conditions}")
results = await self.notion.databases.query(
database_id=self.database_id, filter={"and": filter_conditions} if filter_conditions else None
)
if not results:
self.logger.info("No results found.")
return []
# Extract the page_ids from the results.
page_ids = [page.get("id") for page in results.get("results")]
if self.verbose:
self.logger.info(f"Retrieved {len(page_ids)} results.")
return page_ids
except Exception as e:
self.logger.error(f"Error querying Notion database: {e}")
return []

async def query_page_contents(self, page_id: str) -> Tuple[str, str, List[str]]:
"""
Query the contents of the page with the given page_id.
Args:
page_id (str): The ID of the page to query.
Returns:
Tuple[str, str, List[str]]: A tuple containing the URL, title, and tags of the page.
"""
# Query the contents of the page with the given page_id
page = await self.notion.pages.retrieve(page_id=page_id)
if self.verbose:
self.logger.info(f"Retrieved page: {page}")
# Extract the URL and title properties from the page
url = page.get("properties").get("URL").get("rich_text")[0].get("text")
title = page.get("properties").get("Title").get("title")[0].get("text").get("content")
tags_blob = page.get("properties").get("Tags").get("multi_select")
tags = [tag.get("name") for tag in tags_blob]
# Return the URL and title as a tuple
return url, title, tags
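A minimal sketch of driving the new query API end to end. The NOTION_DATABASE_ID variable name and the example date and tag are assumptions; NotionQuery itself only requires NOTION_TOKEN to be set.

import asyncio
import os

from dotenv import load_dotenv

from wenling.common.notion_query import NotionQuery


async def main() -> None:
    load_dotenv(override=True)  # NotionQuery reads NOTION_TOKEN from the environment
    query = NotionQuery(database_id=os.environ["NOTION_DATABASE_ID"], verbose=True)
    # start_date and end_date may be the same day; tag filters are ANDed together.
    page_ids = await query.query_pages(start_date="2024-02-13", end_date="2024-02-13", tags=["LLM"])
    for page_id in page_ids:
        url, title, tags = await query.query_page_contents(page_id)
        print(title, url, tags)


asyncio.run(main())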
wenling/common/notion_storage.py
@@ -1,5 +1,6 @@
"""Store the data in Notion.
"""

import asyncio
import os
from typing import Any, Dict, List
@@ -173,6 +174,7 @@ async def _add_to_database(self, database_id: str, json_obj: Dict[str, Any]) ->
"Archive Date": {"start": properties["datetime"]},
"Tags": [{"name": tag} for tag in properties.get("tags", [])],
"Status": {"name": properties.get("status", "Archived")},
"Notes": [{"type": "text", "text": {"content": properties.get("notes", "")}}],
"URL": [{"type": "text", "text": {"content": properties.get("url", "")}}],
}
children = await self._create_page_blocks(json_obj["children"])
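For reference, a sketch of the intermediate JSON object that archivers hand to NotionStorage.store(), including the notes property this PR threads through. The property names come from the hunks above; all values are illustrative.

article_json_obj = {
    "properties": {
        "url": "https://example.com/some-article",  # hypothetical
        "notes": "Saved for the reading list",  # new in this PR
        "title": "Some Article",
        "type": "网页",
        "datetime": "2024-02-13",
        "tags": ["LLM", "Agent"],
        "status": "Archived",  # default used by _add_to_database
    },
    "children": [
        {"type": "h2", "text": "Author Name"},
        {"type": "text", "text": "First paragraph of the article..."},
    ],
}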