Skip to content

Commit

Permalink
Merge branch 'release/0.3.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Jul 19, 2023
2 parents 7c876ed + 7acf517 commit 4e2f2cb
Show file tree
Hide file tree
Showing 10 changed files with 143 additions and 15 deletions.
57 changes: 50 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ env:

jobs:
lint:
name: Linting
name: Lint
runs-on: ubuntu-latest
permissions:
contents: read
steps:
-
uses: actions/checkout@v3
Expand All @@ -41,8 +43,10 @@ jobs:
uses: pre-commit/[email protected]

test:
name: Python ${{ matrix.python-version }}
name: Test (Python ${{ matrix.python-version }})
runs-on: ubuntu-latest
permissions:
contents: read
needs:
- lint
strategy:
Expand Down Expand Up @@ -89,10 +93,12 @@ jobs:
docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml down
build:
name: Build project
name: Build
runs-on: ubuntu-latest
permissions:
contents: read
needs:
- test
- lint
steps:
-
uses: actions/checkout@v3
Expand All @@ -118,14 +124,51 @@ jobs:
if-no-files-found: error
retention-days: 7

publish:
name: Publish project
create-release:
name: Release
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')
permissions:
contents: write
needs:
- build
- test
steps:
-
uses: actions/checkout@v3
-
uses: actions/download-artifact@v3
with:
name: artifacts
path: dist
-
name: Get latest release info
id: query-release-info
uses: release-flow/keep-a-changelog-action@v2
with:
command: query
version: ${{ github.ref_name }}
-
name: Display release info
# Print the queried changelog entry to the job log for debugging.
# The labels are plain text: a leading "$" would make the shell expand
# them as (unset) variables and print an empty label.
run: |
echo "Version: ${{ steps.query-release-info.outputs.version }}"
echo "Date: ${{ steps.query-release-info.outputs.release-date }}"
echo "${{ steps.query-release-info.outputs.release-notes }}"
-
uses: ncipollo/release-action@v1
with:
artifacts: "dist/*.tar.gz,dist/*.whl"
body: ${{ steps.query-release-info.outputs.release-notes }}

pypi-publish:
name: Publish
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
needs:
- build
- test
steps:
-
uses: actions/download-artifact@v3
Expand All @@ -134,4 +177,4 @@ jobs:
path: dist
-
name: Publish build to PyPI
uses: pypa/[email protected].6
uses: pypa/[email protected].7
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ repos:
- id: check-case-conflict
- id: detect-private-key
- repo: https://github.com/pre-commit/mirrors-prettier
rev: 'v2.7.1'
rev: 'v3.0.0'
hooks:
- id: prettier
types_or:
Expand All @@ -37,10 +37,10 @@ repos:
exclude: "(^Pipfile\\.lock$)"
# Python hooks
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.265'
rev: 'v0.0.278'
hooks:
- id: ruff
- repo: https://github.com/psf/black
rev: 23.3.0
rev: 23.7.0
hooks:
- id: black
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.0] - 2023-07-19

### Added

- Restricted action permissions to minimal requirements to function
- GitHub CI now also creates a GitHub release with the sdist, wheel and changelog
- Additional classifiers to the project on PyPI

### Fixed

- Handling of ISO-8601 dates with fractional seconds, which Python doesn't support natively

## [0.2.0] - 2023-06-26

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion TODO.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# TODO

- Create a mixin for some fields which seem often paired together, such as `dc:created` and `dc:modified`
- Testing with sample email documents
- Testing with the sample image documents
- Add GitHub Pages with more detailed documentation
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,22 @@ authors = [
]
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Environment :: Web Environment",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Programming Language :: Python :: Implementation :: PyPy"
]
dependencies = ["httpx"]
dependencies = ["httpx ~= 0.24"]

[project.urls]
Documentation = "https://github.com/stumpylog/tika-rest-client#readme"
Expand Down
2 changes: 1 addition & 1 deletion src/tika_client/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
34 changes: 33 additions & 1 deletion src/tika_client/data_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import logging
import re
from datetime import datetime
from datetime import timedelta
from enum import Enum
from typing import Dict
from typing import List
Expand All @@ -7,6 +10,9 @@

# Based on https://cwiki.apache.org/confluence/display/TIKA/Metadata+Overview

logger = logging.getLogger("tika-client.data")
_FRACTION_REGEX = re.compile("(.*)([\\.,][0-9]+)(.*)")


class TikaKey(str, Enum):
Parsers = "X-TIKA:Parsed-By"
Expand Down Expand Up @@ -96,8 +102,34 @@ def get_optional_datetime(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str])
"""
if key not in self.data: # pragma: no cover
return None

date_str: str = self.data[key]

# Handle fractional seconds
frac = _FRACTION_REGEX.match(date_str)
if frac is not None:
logger.info("Located fractional seconds")
delta = timedelta(seconds=float(frac.group(2)))
date_str = frac.group(1)
# Attempt to include the timezone info still
if frac.group(3) is not None:
date_str += frac.group(3)
else:
delta = timedelta()

# Handle Zulu time as UTC
return datetime.fromisoformat(self.data[key].replace("Z", "+00:00"))
if "Z" in date_str:
date_str = date_str.replace("Z", "+00:00")

# Assume UTC if it is not set
if "+" not in date_str:
date_str += "+00:00"

try:
return datetime.fromisoformat(date_str) + delta
except ValueError as e:
logger.error(f"{e} during datetime parsing")
return None

def get_optional_string(self, key: Union[TikaKey, DublinCoreKey, XmpKey, str]) -> Optional[str]:
if key not in self.data:
Expand Down
3 changes: 3 additions & 0 deletions tests/samples/README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
sample.jpg - https://unsplash.com/photos/8OyKWQgBsKQ
sample.png - https://unsplash.com/photos/iar-afB0QQw

microsoft-sample.docx - produced by Microsoft Office
sample-libre-office.odt - produced by LibreOffice Writer 7.5.12
Binary file added tests/samples/sample-libre-office.odt
Binary file not shown.
31 changes: 31 additions & 0 deletions tests/test_file_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from datetime import datetime
from datetime import timezone

import magic

from tests.conftest import SAMPLE_DIR
from tika_client.client import TikaClient


class TestLibreOfficeFormats:
    """Parsing checks for documents produced by LibreOffice."""

    def test_parse_libre_office_writer_document(self, tika_client: TikaClient):
        """Parse the sample Writer document as HTML and check its content and metadata."""
        sample_path = SAMPLE_DIR / "sample-libre-office.odt"
        mime_type = magic.from_file(str(sample_path), mime=True)

        expected_body = (
            "<body><p>This is a document created by LibreOffice Writer 7.5.12, on July 19th, 2023</p>\n</body>"
        )
        # The expected creation time includes a fractional-second component (719 ms).
        expected_created = datetime(2023, 7, 19, 11, 30, 44, 719000, tzinfo=timezone.utc)

        resp = tika_client.tika.as_html.from_file(sample_path, mime_type)

        assert resp.type == "application/vnd.oasis.opendocument.text"
        assert expected_body in resp.content
        assert resp.content_length == 11149
        assert resp.created is not None
        assert resp.created == expected_created

0 comments on commit 4e2f2cb

Please sign in to comment.