Skip to content

Commit

Permalink
added 250+ new ontologies; further rework and refinements ...
Browse files Browse the repository at this point in the history
summary:
- added a way to include manually maintained ontologies
- added a workflow to check manually maintained ontologies
  to find broken links
  • Loading branch information
k00ni committed Apr 3, 2024
1 parent e39e383 commit eb351e6
Show file tree
Hide file tree
Showing 14 changed files with 696 additions and 15 deletions.
19 changes: 19 additions & 0 deletions .github/workflows/CheckManuallyMaintainedMetadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Runs scripts/bin/check-manually-maintained-metadata, which checks the availability of
# the RDF files referenced in manually-maintained-metadata-about-ontologies.csv.
name: Check manually maintained metadata about ontologies

on: [push, pull_request]

jobs:
  CI:
    runs-on: ubuntu-latest
    steps:
      - name: Set up Git repository
        uses: actions/checkout@v3

      - name: Install PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: '8.2'
          ini-values: memory_limit=1G

      # scripts/bin/bootstrap.php requires scripts/vendor/autoload.php, so the Composer
      # dependencies declared in scripts/composer.json must be installed before the check.
      # NOTE(review): assumes scripts/vendor is not committed to the repository - confirm.
      - name: Install PHP vendors
        run: composer install --no-interaction --no-progress --working-dir=scripts

      - name: Check
        run: scripts/bin/check-manually-maintained-metadata
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ index:
rm -f scripts/var/temporary-index.db
scripts/bin/read-dbpedia-archivo
scripts/bin/read-linked-open-vocabularies
scripts/bin/merge-in-manually-maintained-metadata
scripts/bin/write-index-csv

prepare:
Expand Down
12 changes: 10 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
FROM php:8.2-cli
FROM php:8.3-cli

# install and setup required system library
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl git gnupg libicu-dev libpng-dev libzip-dev nano net-tools sudo unzip wget zlib1g-dev

RUN docker-php-ext-install -j$(nproc) intl zip

# install Composer globally (required to install PHP vendors)
RUN curl -sS https://getcomposer.org/installer | php && mv composer.phar /usr/local/bin/composer

# add a non-root user to limit user rights
RUN useradd -r --home /home/govi -u 1000 govi
RUN usermod -a -G www-data govi
RUN mkdir /home/govi
RUN chown govi:www-data /home/govi
RUN adduser govi sudo

# add custom PHP.ini settings
RUN mv "$PHP_INI_DIR/php.ini-development" "$PHP_INI_DIR/php.ini"
COPY ./custom.ini /usr/local/etc/php/conf.d/custom.ini

# create and mark working dir
RUN mkdir /govi
WORKDIR /govi

# Run this after the container started to keep it alive.
# Exec-form CMD needs one array element per argument; a single "tail -f /dev/null"
# element would be treated as the name of the executable and fail to start.
CMD ["tail", "-f", "/dev/null"]
9 changes: 9 additions & 0 deletions docker/custom.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
; Custom PHP settings; docker/Dockerfile copies this file to
; /usr/local/etc/php/conf.d/custom.ini inside the image.

; resource limits
memory_limit = 1G
upload_max_filesize = 64M
post_max_size = 64M
max_execution_time = 600

; report all errors
error_reporting = E_ALL
; print errors to the output, including errors raised during PHP startup
display_errors = On
display_startup_errors = On
228 changes: 228 additions & 0 deletions index.csv

Large diffs are not rendered by default.

247 changes: 247 additions & 0 deletions manually-maintained-metadata-about-ontologies.csv

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions scripts/bin/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

// paths
define('ROOT_DIR_PATH', __DIR__.DIRECTORY_SEPARATOR.'..'.DIRECTORY_SEPARATOR.'..'.DIRECTORY_SEPARATOR);
define('SCRIPTS_DIR_PATH', ROOT_DIR_PATH.DIRECTORY_SEPARATOR.'scripts'.DIRECTORY_SEPARATOR);
define('SCRIPTS_DIR_PATH', ROOT_DIR_PATH.'scripts'.DIRECTORY_SEPARATOR);
define('INDEX_CSV_PATH', ROOT_DIR_PATH.'index.csv');

// CSV
Expand All @@ -11,7 +11,9 @@
'"ontology title","ontology uri","latest n3 file","latest ntriples file","latest rdf/xml file","latest turtle file","latest access","source title","source url"'
);

define('MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV', 'manually-maintained-metadata-about-ontologies.csv');

define('SQLITE_FILE_PATH', SCRIPTS_DIR_PATH.'var'.DIRECTORY_SEPARATOR.'temporary-index.db');

// include vendor libraries
require SCRIPTS_DIR_PATH.'vendor'.DIRECTORY_SEPARATOR.'autoload.php';
require_once SCRIPTS_DIR_PATH.'vendor'.DIRECTORY_SEPARATOR.'autoload.php';
67 changes: 67 additions & 0 deletions scripts/bin/check-manually-maintained-metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env php
<?php

/**
* This file checks availability of all RDF files in manually-maintained-metadata-about-ontologies.csv
*/

declare(strict_types=1);

use function App\isEmpty;
use function App\urlIsAccessible;

require_once 'bootstrap.php';

echo PHP_EOL.'Check availability of referenced RDF files ...'.PHP_EOL;

/*
 * Walks through all entries of the manually maintained CSV file and probes, per entry,
 * the first RDF file which is set. Entries whose RDF file is not accessible are collected
 * and reported at the end.
 *
 * Exit status: 0 if all RDF files are accessible, 1 otherwise.
 * Throws an Exception if the CSV file is not readable or an entry has no RDF file at all.
 */

$csvFilePath = __DIR__.'/../../'.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV;
$csvLines = file($csvFilePath, FILE_IGNORE_NEW_LINES);
if (false === $csvLines) {
    throw new Exception('Could not read '.$csvFilePath);
}

$failingRdfFiles = [];

foreach ($csvLines as $line => $csvLine) {
    // ignore header and blank lines
    if (0 === $line || '' === trim($csvLine)) {
        continue;
    }

    $entry = str_getcsv($csvLine);

    echo '.';

    /** @var string|null */
    $rdfFile = null;
    // takes the first defined RDF file and checks it
    foreach ([2, 3, 4, 5] as $key) {
        if (isset($entry[$key]) && false === isEmpty($entry[$key])) {
            $rdfFile = $entry[$key];
            break;
        }
    }

    if (null === $rdfFile) {
        throw new Exception($entry[0].' has no related RDF File set');
    }

    if (false === urlIsAccessible($rdfFile, 30, 10)) {
        $failingRdfFiles[] = [
            'title' => $entry[0],
            'rdf-file' => $rdfFile,
        ];
    }

    // be gentle to the remote servers
    sleep(1);
}

if ([] !== $failingRdfFiles) {
    echo PHP_EOL;
    echo PHP_EOL;
    echo 'The following RDF files are not accessible:';
    echo PHP_EOL;
    var_dump($failingRdfFiles);
    echo PHP_EOL;

    // A top-level "return 1;" only ends the script, the process would still exit with
    // status 0 and the CI job would pass. exit() sets the status explicitly.
    exit(1);
}

echo PHP_EOL;
echo PHP_EOL;

exit(0);
74 changes: 74 additions & 0 deletions scripts/bin/merge-in-manually-maintained-metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env php
<?php

declare(strict_types=1);

use App\IndexEntry;

use function App\isUrl;
use function App\storeTemporaryIndexIntoSQLiteFile;

/**
* Merges manually maintained metadata into the SQLite file, if not known already.
*/

require 'bootstrap.php';

echo PHP_EOL.'Merge in manually maintained metadata ...';

/*
 * open DB and get a list of all entries
 */
$db = new PDO('sqlite:'.SQLITE_FILE_PATH);
$sql = 'SELECT ontology_uri FROM entry ORDER BY ontology_title ASC';
$stmt = $db->prepare($sql);
$stmt->execute();
$knownOntologyUris = $stmt->fetchAll(PDO::FETCH_COLUMN);

/*
 * merge in all manually maintained metadata which is not already part of the SQLite file
 *
 * Throws an Exception if:
 * - the CSV file is not readable,
 * - a row has no ontology URI column or the ontology URI is not a URL,
 * - an ontology is already part of the SQLite file (= no need to maintain it manually),
 * - there is nothing to add at all.
 */
$csvFilePath = __DIR__.'/../../'.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV;
$csvLines = file($csvFilePath, FILE_IGNORE_NEW_LINES);
if (false === $csvLines) {
    throw new Exception('Could not read '.$csvFilePath);
}

$unknownIndexEntries = [];
foreach ($csvLines as $line => $csvLine) {
    // ignore header and blank lines
    if (0 === $line || '' === trim($csvLine)) {
        continue;
    }

    $row = str_getcsv($csvLine);
    if (count($row) < 2) {
        $msg = 'Line '.($line + 1).' of '.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV;
        $msg .= ' has no ontology URI column';
        throw new Exception($msg);
    }

    // columns 0-6 are read below; missing trailing columns are handed over as null
    // instead of triggering "undefined array key" warnings
    $row = array_pad($row, 7, null);
    [$ontologyTitle, $ontologyUri] = $row;

    // an invalid URI is a different problem than an already known ontology,
    // therefore it gets its own error message
    if (false === isUrl($ontologyUri)) {
        $msg = 'Ontology '.$ontologyTitle.' has an invalid ontology URI ('.$ontologyUri.').';
        $msg .= ' Please fix it in '.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV;
        throw new Exception($msg);
    }

    // check if ontology URI is already known
    if (in_array($ontologyUri, $knownOntologyUris, true)) {
        $msg = 'Ontology '.$ontologyTitle.' ('.$ontologyUri.') is known and does not have to be maintained manually!';
        $msg .= ' Please remove it from '.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV;
        throw new Exception($msg);
    }

    $entry = new IndexEntry('Manually maintained', 'https://github.com/k00ni/govi');
    $entry->setOntologyTitle($ontologyTitle);
    $entry->setOntologyUri($ontologyUri);

    // related files
    $entry->setLatestN3File($row[2]);
    $entry->setLatestNtFile($row[3]);
    $entry->setLatestRdfXmlFile($row[4]);
    $entry->setLatestTtlFile($row[5]);

    $entry->setLatestAccess($row[6]);

    $unknownIndexEntries[] = $entry;
}

if ([] === $unknownIndexEntries) {
    throw new Exception('No entries from '.MANUALLY_MAINTAINED_METADATA_ABOUT_ONTOLOGIES_CSV.' to add to SQLite file!');
}

echo PHP_EOL.'Store '.count($unknownIndexEntries).' unknown entries into SQLite file';
storeTemporaryIndexIntoSQLiteFile($unknownIndexEntries);

echo PHP_EOL;
echo PHP_EOL;
1 change: 0 additions & 1 deletion scripts/bin/read-linked-open-vocabularies
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use quickRdf\DataFactory;
use quickRdfIo\Util;
use rdfInterface\BlankNodeInterface;
use rdfInterface\LiteralInterface;
use rdfInterface\NamedNodeInterface;
use sweetrdf\InMemoryStoreSqlite\Store\InMemoryStoreSqlite;

use function App\isEmpty;
Expand Down
5 changes: 5 additions & 0 deletions scripts/bin/write-index-csv
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,18 @@ $stmt = $db->prepare($sql);
$stmt->execute();

/*
 * Build the CSV content: head line first, then one line per row of the result set.
 * Each value is enclosed in double quotes.
 */
$dataToWrite = INDEX_CSV_HEAD_STRING.PHP_EOL;
$i = 0;
while (false !== ($row = $stmt->fetch(PDO::FETCH_ASSOC))) {
    echo '.';

    // a double quote inside a value has to be doubled, otherwise it would end the
    // enclosure too early and break the CSV structure
    $escapedValues = array_map(
        static fn (mixed $value): string => str_replace('"', '""', (string) $value),
        $row
    );

    $dataToWrite .= '"'.implode('","', $escapedValues).'"'.PHP_EOL;

    ++$i;
}

// do not report success if nothing was written
if (false === file_put_contents(INDEX_CSV_PATH, $dataToWrite)) {
    throw new \Exception('Could not write '.INDEX_CSV_PATH);
}

echo PHP_EOL.PHP_EOL.'Wrote '.$i.' entries';

echo PHP_EOL;
echo PHP_EOL;
1 change: 1 addition & 0 deletions scripts/composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"prefer-stable": true,
"require": {
"php": ">=8.2",
"php-curl-class/php-curl-class": "^9.19",
"sweetrdf/in-memory-store-sqlite": "^1.1.0",
"sweetrdf/quick-rdf": "^2.0",
"sweetrdf/quick-rdf-io": "^1.0"
Expand Down
40 changes: 30 additions & 10 deletions scripts/src/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

namespace App;

use Curl\Curl;
use Exception;
use PDO;
use PDOException;
Expand Down Expand Up @@ -31,16 +32,6 @@ function cleanTitle(string $str): string
return $str;
}

/**
 * Rough URL detection: a string counts as URL if it begins with
 * "http://", "https://" or "www." (case-sensitive, no further validation).
 */
function isUrl(string $str): bool
{
    return match (true) {
        str_starts_with($str, 'http://'),
        str_starts_with($str, 'https://'),
        str_starts_with($str, 'www.') => true,
        default => false,
    };
}

/**
* It seems that empty() is not enough to check, if something is really empty.
* This function takes care of the edge cases.
Expand All @@ -59,6 +50,16 @@ function isEmpty(string|null $input): bool
}
}

/**
 * Simple prefix based check whether the given string looks like an URL.
 *
 * Only the beginning of the string is inspected, there is no further validation.
 */
function isUrl(string $str): bool
{
    foreach (['http://', 'https://', 'www.'] as $acceptedPrefix) {
        if (str_starts_with($str, $acceptedPrefix)) {
            return true;
        }
    }

    return false;
}

/**
* @param array<string,\App\IndexEntry> $temporaryIndex
*
Expand All @@ -69,6 +70,7 @@ function storeTemporaryIndexIntoSQLiteFile(array $temporaryIndex): void
// create/open SQLite file (= our database)
$db = new PDO('sqlite:'.SQLITE_FILE_PATH);

// TODO move to a better place to avoid unnecessary SQL commands
$db->exec('CREATE TABLE IF NOT EXISTS entry (
ontology_uri TEXT PRIMARY KEY,
ontology_title TEXT,
Expand Down Expand Up @@ -155,3 +157,21 @@ function uncompressGzArchive(string $sourceFilepath, string $targetFilepath): vo
throw new Exception('Uncompressing failed, could not open: '.$sourceFilepath);
}
}

/**
 * Probes the given URL via cURL and tells whether that worked without an error.
 *
 * SSL peer and host verification are switched off for the probe.
 *
 * NOTE(review): CURLOPT_CONNECT_ONLY is set. According to the libcurl documentation this
 * stops after the connection setup, so presumably no HTTP status is evaluated and the
 * redirect options have no effect (e.g. a 404 would still count as accessible) - confirm.
 *
 * @param string $url              URL to probe
 * @param int    $timeout          value handed to Curl::setConnectTimeout
 * @param int    $maximumRedirects value handed to Curl::setMaximumRedirects
 *
 * @return bool false if the Curl instance reports an error, true otherwise
 */
function urlIsAccessible(string $url, int $timeout = 5, int $maximumRedirects = 10): bool
{
    $client = new Curl();

    $options = [
        CURLOPT_CONNECT_ONLY => true,
        CURLOPT_FOLLOWLOCATION => true, // follow redirects
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    ];
    foreach ($options as $option => $value) {
        $client->setOpt($option, $value);
    }

    $client->setConnectTimeout($timeout);
    $client->setMaximumRedirects($maximumRedirects);

    $client->get($url);

    return !$client->error;
}
1 change: 1 addition & 0 deletions scripts/var/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*.*
!.gitignore
merge_cache

0 comments on commit eb351e6

Please sign in to comment.