diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a526042..5868315 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,8 +17,10 @@ env: jobs: lint: - name: Linting + name: Lint runs-on: ubuntu-latest + permissions: + contents: read steps: - uses: actions/checkout@v3 @@ -41,8 +43,10 @@ jobs: uses: pre-commit/action@v3.0.0 test: - name: Python ${{ matrix.python-version }} + name: Test (Python ${{ matrix.python-version }}) runs-on: ubuntu-latest + permissions: + contents: read needs: - lint strategy: @@ -89,10 +93,12 @@ jobs: docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml down build: - name: Build project + name: Build runs-on: ubuntu-latest + permissions: + contents: read needs: - - test + - lint steps: - uses: actions/checkout@v3 @@ -118,14 +124,51 @@ jobs: if-no-files-found: error retention-days: 7 - publish: - name: Publish project + create-release: + name: Release + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/') + permissions: + contents: write + needs: + - build + - test + steps: + - + uses: actions/checkout@v3 + - + uses: actions/download-artifact@v3 + with: + name: artifacts + path: dist + - + name: Get latest release info + id: query-release-info + uses: release-flow/keep-a-changelog-action@v2 + with: + command: query + version: ${{ github.ref_name }} + - + name: Display release info + run: | + echo "$Version: ${{ steps.query-release-info.outputs.version }}" + echo "$Date: ${{ steps.query-release-info.outputs.release-date }}" + echo "${{ steps.query-release-info.outputs.release-notes }}" + - + uses: ncipollo/release-action@v1 + with: + artifacts: "dist/*.tar.gz,dist/*.whl" + body: ${{ steps.query-release-info.outputs.release-notes }} + + pypi-publish: + name: Publish runs-on: ubuntu-latest if: startsWith(github.ref, 'refs/tags/') permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing needs: - build + - test steps: - uses: 
actions/download-artifact@v3 @@ -134,4 +177,4 @@ jobs: path: dist - name: Publish build to PyPI - uses: pypa/gh-action-pypi-publish@v1.8.6 + uses: pypa/gh-action-pypi-publish@v1.8.7 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 17f5e5b..d93dcaa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: check-case-conflict - id: detect-private-key - repo: https://github.com/pre-commit/mirrors-prettier - rev: 'v2.7.1' + rev: 'v3.0.0' hooks: - id: prettier types_or: @@ -37,10 +37,10 @@ repos: exclude: "(^Pipfile\\.lock$)" # Python hooks - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: 'v0.0.265' + rev: 'v0.0.278' hooks: - id: ruff - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 23.7.0 hooks: - id: black diff --git a/CHANGELOG.md b/CHANGELOG.md index e6f6216..5287c98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.3.0] - 2023-07-19 + +### Added + +- Restricted action permissions to minimal requirements to function +- GitHub CI also now creates a GitHub release with sdist, wheel and changelog +- Additional classifiers to the project on PyPI + +### Fixed + +- Handling of ISO-8601 dates with fractional seconds, which Python doesn't support natively + ## [0.2.0] - 2023-06-26 ### Fixed diff --git a/TODO.md b/TODO.md index 1f434cb..e3ebed5 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,5 @@ # TODO -- Create a mixin for some fields which seem often paired together, such as `dc:created` and `dc:modified` - Testing with sample email documents - Testing with the sample image documents +- Add GitHub Pages with more detailed documentation diff --git a/pyproject.toml b/pyproject.toml index a48ff3d..51d1890 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,15 +15,22 @@ authors = [ ] classifiers = [ "Development Status :: 4 - Beta", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Environment :: Web Environment", "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Python :: Implementation :: PyPy" ] -dependencies = ["httpx"] +dependencies = ["httpx ~= 0.24"] [project.urls] Documentation = "https://github.com/stumpylog/tika-rest-client#readme" diff --git a/src/tika_client/__about__.py b/src/tika_client/__about__.py index d3ec452..493f741 100644 --- a/src/tika_client/__about__.py +++ b/src/tika_client/__about__.py @@ -1 +1 @@ -__version__ = "0.2.0" 
+__version__ = "0.3.0" diff --git a/src/tika_client/data_models.py b/src/tika_client/data_models.py index 1d5fd74..ee98bd2 100644 --- a/src/tika_client/data_models.py +++ b/src/tika_client/data_models.py @@ -1,4 +1,7 @@ +import logging +import re from datetime import datetime +from datetime import timedelta from enum import Enum from typing import Dict from typing import List @@ -7,6 +10,9 @@ # Based on https://cwiki.apache.org/confluence/display/TIKA/Metadata+Overview +logger = logging.getLogger("tika-client.data") +_FRACTION_REGEX = re.compile("(.*)([\\.,][0-9]+)(.*)") + class TikaKey(str, Enum): Parsers = "X-TIKA:Parsed-By" @@ -96,8 +102,34 @@ def get_optional_datetime(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str]) """ if key not in self.data: # pragma: no cover return None + + date_str: str = self.data[key] + + # Handle fractional seconds + frac = _FRACTION_REGEX.match(date_str) + if frac is not None: + logger.info("Located fractional seconds") + delta = timedelta(seconds=float(frac.group(2))) + date_str = frac.group(1) + # Attempt to include the timezone info still + if frac.group(3) is not None: + date_str += frac.group(3) + else: + delta = timedelta() + # Handle Zulu time as UTC - return datetime.fromisoformat(self.data[key].replace("Z", "+00:00")) + if "Z" in date_str: + date_str = date_str.replace("Z", "+00:00") + + # Assume UTC if it is not set + if "+" not in date_str: + date_str += "+00:00" + + try: + return datetime.fromisoformat(date_str) + delta + except ValueError as e: + logger.error(f"{e} during datetime parsing") + return None def get_optional_string(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str]) -> Optional[str]: if key not in self.data: diff --git a/tests/samples/README.md b/tests/samples/README.md index 513ce17..7ea407c 100644 --- a/tests/samples/README.md +++ b/tests/samples/README.md @@ -1,2 +1,5 @@ sample.jpg - https://unsplash.com/photos/8OyKWQgBsKQ sample.png - https://unsplash.com/photos/iar-afB0QQw + 
+microsoft-sample.docx - produced by Microsoft Office +sample-libre-office.odt - produced by LibreOffice Writer 7.5.12 diff --git a/tests/samples/sample-libre-office.odt b/tests/samples/sample-libre-office.odt new file mode 100644 index 0000000..405e4a6 Binary files /dev/null and b/tests/samples/sample-libre-office.odt differ diff --git a/tests/test_file_formats.py b/tests/test_file_formats.py new file mode 100644 index 0000000..6796d7c --- /dev/null +++ b/tests/test_file_formats.py @@ -0,0 +1,31 @@ +from datetime import datetime +from datetime import timezone + +import magic + +from tests.conftest import SAMPLE_DIR +from tika_client.client import TikaClient + + +class TestLibreOfficeFormats: + def test_parse_libre_office_writer_document(self, tika_client: TikaClient): + test_file = SAMPLE_DIR / "sample-libre-office.odt" + resp = tika_client.tika.as_html.from_file(test_file, magic.from_file(str(test_file), mime=True)) + + assert resp.type == "application/vnd.oasis.opendocument.text" + assert ( + "

This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023

\n" + in resp.content + ) + assert resp.content_length == 11149 + assert resp.created is not None + assert resp.created == datetime( + year=2023, + month=7, + day=19, + hour=11, + minute=30, + second=44, + microsecond=719000, + tzinfo=timezone.utc, + )