diff --git a/00_parser.ipynb b/00_parser.ipynb index bc0fead..d23c016 100644 --- a/00_parser.ipynb +++ b/00_parser.ipynb @@ -22,10 +22,22 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dotenv extension is already loaded. To reload it, use:\n", + " %reload_ext dotenv\n" + ] + } + ], "source": [ "#hide\n", - "from nbdev.showdoc import *" + "from nbdev.showdoc import *\n", + "\n", + "%load_ext dotenv\n", + "%dotenv" ] }, { @@ -43,6 +55,7 @@ "source": [ "#export\n", "\n", + "import os\n", "from urllib.parse import urlparse, parse_qsl, parse_qs\n", "import requests\n", "import arrow\n", @@ -60,12 +73,13 @@ " date = '{}Z'.format(date_obj.format('YYYY-MM-DDT00:00:00'))\n", " return date\n", "\n", - "def parse_query(query):\n", + "def parse_query(query, api_version=2):\n", " '''\n", " Converts the parameters of a search using the Trove web interface into a form the API will understand.\n", " \n", " Parameters: \n", " * `query` – the url of a search in the Trove newspapers & gazettes category\n", + " * `api_version` – Trove API version (default is 2)\n", " \n", " Returns: \n", " * a dict containing the parameters (multiple values will be in a list)\n", @@ -124,10 +138,13 @@ " elif key == 'keyword.any':\n", " keywords.append('({})'.format(' OR '.join(value.split())))\n", " elif key in ['l-ArtType', 'l-advArtType', 'l-artType']:\n", - " if value == 'newspapers':\n", - " new_params['zone'] = 'newspaper'\n", - " elif value == 'gazette':\n", - " new_params['zone'] = 'gazette'\n", + " if api_version == 2:\n", + " if value == 'newspapers':\n", + " new_params['zone'] = 'newspaper'\n", + " elif value == 'gazette':\n", + " new_params['zone'] = 'gazette'\n", + " elif api_version == 3:\n", + " new_params['l-artType'] = value\n", " if keywords:\n", " if 'q' in new_params:\n", " new_params['q'] += ' AND {}'.format(' AND '.join(keywords))\n", @@ -145,12 +162,47 @@ " 
new_params['q'] = date_query\n", " if 'q' not in new_params:\n", " new_params['q'] = ' '\n", - " if 'zone' not in new_params:\n", + " if api_version == 2 and 'zone' not in new_params:\n", " new_params['zone'] = 'newspaper,gazette'\n", + " if api_version == 3 and 'category' not in new_params:\n", + " new_params['category'] = 'newspaper'\n", " # return '{}?{}'.format('https://api.trove.nla.gov.au/v2/result', urlencode(new_params, doseq=True))\n", " return new_params" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "
parse_query
[source]parse_query
(**`query`**, **`api_version`**=*`2`*)\n",
+ "\n",
+ "Converts the parameters of a search using the Trove web interface into a form the API will understand.\n",
+ "\n",
+ "Parameters: \n",
+ "* `query` – the url of a search in the Trove newspapers & gazettes category\n",
+ "* `api_version` – Trove API version (default is 2)\n",
+ "\n",
+ "Returns: \n",
+ "* a dict containing the parameters (multiple values will be in a list)"
+ ],
+ "text/plain": [
+ "parse_query
function.parse_query
function
The second parameter to `parse_query` is the Trove API version number. The default is `2` for backwards compatibility.
from trove_query_parser.parser import parse_query
-parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon')
+parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon', 3)
{'q': 'wragge', - 'zone': 'newspaper', + 'l-artType': 'newspapers', 'l-state': ['Queensland'], 'l-category': ['Article'], 'l-illustrated': 'true', - 'l-illtype': ['Cartoon']}+ 'l-illtype': ['Cartoon'], + 'category': 'newspaper'} @@ -116,5 +118,5 @@
format_date
-parse_query
[source]+
parse_query
(query
)
parse_query
[source]
parse_query
(query
,api_version
=2
)Converts the parameters of a search using the Trove web interface into a form the API will understand.
Parameters:
- +
query
– the url of a search in the Trove newspapers & gazettes categoryapi_version
– Trove API version (default is 2)Returns:
@@ -91,13 +99,6 @@
parse_query
-
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon')
+params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon', 3)
params
@@ -139,11 +140,12 @@ Basic usage
If you want to use this to get data back from the Trove API, you'll need to add a key
parameter with your Trove API key. You might also want to change the encoding
of the results to 'json'. Then you can just give the parameters as params
to requests
. For example:
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon')
-params['key'] = 'mYApiKEY'
+If you want to use this to get data back from the Trove API, you'll need to provide your Trove API key, either as a `key` query parameter (version 2) or in the `X-API-KEY` request header (version 3). You might also want to change the `encoding` of the results to 'json'. Then you can just give the parameters as `params` to `requests`. For example:
+params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon', 3)
+headers = {'X-API-KEY': 'mYApiKEY'}
params['encoding'] = 'json'
-response = requests.get('https://api.trove.nla.gov.au/v2/result', params=params)
+params['n'] = 2
+response = requests.get('https://api.trove.nla.gov.au/v3/result', params=params, headers=headers)
data = response.json()
Assuming your API key is valid, this will return the following results:
-{'response': {'query': 'wragge',
- 'zone': [{'name': 'newspaper',
- 'records': {'s': '*',
- 'n': '3',
- 'total': '3',
- 'article': [{'id': '76672882',
- 'url': '/newspaper/76672882',
- 'heading': 'THE POLITICAL STIGER YORTEX.',
- 'category': 'Article',
- 'title': {'id': '274',
- 'value': 'The Charleville Times (Brisbane, Qld. : 1896 - 1954)'},
- 'date': '1901-10-12',
- 'page': 4,
- 'pageSequence': 4,
- 'relevance': {'score': '250.99701', 'value': 'very relevant'},
- 'snippet': 'PREMIER PHILP: "I think that\'s a better shot than Wragge\'s."',
- 'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/76672882?searchTerm=wragge'},
- {'id': '50294024',
- 'url': '/newspaper/50294024',
- 'heading': 'We nearly broke the drought (. . . WE THINK)',
- 'category': 'Article',
- 'title': {'id': '12',
- 'value': 'The Courier-Mail (Brisbane, Qld. : 1933 - 1954)'},
- 'date': '1952-02-16',
- 'page': 2,
- 'pageSequence': 2,
- 'relevance': {'score': '12.74085', 'value': 'very relevant'},
- 'snippet': 'WE were determined to try our hand at rainmaking, and',
- 'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/50294024?searchTerm=wragge'},
- {'id': '76372015',
- 'url': '/newspaper/76372015',
- 'heading': 'Digest What YOU Eat.',
- 'category': 'Article',
- 'title': {'id': '266',
- 'value': 'The Western Champion and General Advertiser for the Central-Western Districts (Barcaldine, Qld. : 1892 - 1922)'},
- 'date': '1906-01-08',
- 'page': 5,
- 'pageSequence': 5,
- 'relevance': {'score': '5.734701', 'value': 'very relevant'},
- 'snippet': "The reason why any wholesome food is not properly digested is because the stomach lacks some important element of digestion. Some stomach' lack peptone, others are deficient in gastric juice or hydrochloric",
- 'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/76372015?searchTerm=wragge'}]}}]}}
+{'query': 'wragge',
+ 'category': [{'code': 'newspaper',
+ 'name': 'Newspapers & Gazettes',
+ 'records': {'s': '*',
+ 'n': 2,
+ 'total': 510,
+ 'next': 'https://api.trove.nla.gov.au/v3/result?q=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrated=true&l-illtype=Cartoon&category=newspaper&encoding=json&n=2&s=AoIIQzWFoig4MjM0NjM1NA%3D%3D',
+ 'nextStart': 'AoIIQzWFoig4MjM0NjM1NA==',
+ 'article': [{'id': '21765046',
+ 'url': 'https://api.trove.nla.gov.au/v3/newspaper/21765046',
+ 'heading': 'Mrs. Adelaide Wragge.',
+ 'category': 'Article',
+ 'title': {'id': '16',
+ 'title': 'The Brisbane Courier (Qld. : 1864 - 1933)'},
+ 'date': '1931-12-16',
+ 'page': '13',
+ 'pageSequence': '13',
+ 'relevance': {'score': 215.65185546875, 'value': 'very relevant'},
+ 'snippet': 'Formerly of Victoria, and in 1864 Mayoress of Melbourne, the late Mrs. Wragge, who died recently, had been',
+ 'troveUrl': 'https://.nla.gov.au/nla.news-article21765046?searchTerm=wragge'},
+ {'id': '82346354',
+ 'url': 'https://api.trove.nla.gov.au/v3/newspaper/82346354',
+ 'heading': 'MR WRAGGE ON WEATHER CANNONS.',
+ 'category': 'Article',
+ 'title': {'id': '269',
+ 'title': 'The North Queensland Register (Townsville, Qld. : 1892 - 1905)'},
+ 'date': '1901-03-11',
+ 'page': '10',
+ 'pageSequence': '10',
+ 'relevance': {'score': 181.52200317382812, 'value': 'very relevant'},
+ 'snippet': 'I have been to Styria, have seen the cannons made in the forges, have witnessed the experiments, have visited Herr Stiger, the inventor of the',
+ 'troveUrl': 'https://.nla.gov.au/nla.news-article82346354?searchTerm=wragge'}]}}]}
@@ -219,7 +213,7 @@ Basic usage
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20weather', 3)
+assert {'q': 'wragge weather', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Multiple keywords with OR
are passed along as is.
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20OR%20weather', 3)
+assert {'q': 'wragge OR weather', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Phrase search passed along as is.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=%22inclement%20wragge%22', 3)
+assert {'q': '"inclement wragge"', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
More complex queries such as date ranges should be passed along as is.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20date%3A%5B1901%20TO%201903%5D&l-artType=newspapers', 3)
+assert {'q': 'wragge date:[1901 TO 1903]', 'category': 'newspaper', 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to gazettes using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=gazette', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-artType': 'gazette'} == params
+assert query_api(params) == 200
+
Limit state to NSW using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=New%20South%20Wales', 3)
+assert {'q': 'wragge', 'l-state': ['New South Wales'], 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Limit newspaper to SMH using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-title=35', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-title': ['35'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to 'Article' category using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-category=Article', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-category': ['Article'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to specific decade using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-decade=190', 3)
+assert {'q': 'wragge', 'l-artType': 'newspapers', 'l-decade': ['190'], 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Limit to specific year using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-decade=190&l-year=1903', 3)
+assert {'q': 'wragge', 'l-artType': 'newspapers', 'l-decade': ['190'], 'l-year': ['1903'], 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Limit to articles with illustration type of 'Photo' with facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-illustrationType=Photo', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-illustrated': 'true', 'l-illtype': ['Photo'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to articles containing more than 1,000 words using facets.
+ +params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-word=1000%2B%20Words', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-word': ['1000+ Words'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Multiple keywords in 'Any of these words' box.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.any=wragge%20weather', 3)
+assert {'q': '(wragge OR weather)', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Multiple keywords in 'The phrase' box.
params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.phrase=inclement%20wragge', 3)
+assert {'q': '"inclement wragge"', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Keywords in 'All of these words' and 'Without these words' boxes.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.not=weather&keyword=wragge', 3)
+assert {'q': 'wragge AND NOT (weather)', 'category': 'newspaper'} == params
+assert query_api(params) == 200
+
Limit to a specific date range.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&date.from=1900-01-01&date.to=1900-02-04&l-advArtType=newspapers', 3)
+assert {'q': 'wragge date:[1899-12-31T00:00:00Z TO 1900-02-04T00:00:00Z]', 'category': 'newspaper', 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to a specific state.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advstate=Queensland', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-state': ['Queensland'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to specific newspapers.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advtitle=16&l-advtitle=1055', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-title': ['16', '1055'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to a specific category.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advcategory=Family%20Notices', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-category': ['Family Notices'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to a specific illustration type.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advIllustrationType=Photo', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-illustrated': 'true', 'l-illtype': ['Photo'], 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+
Limit to a specific number of words.
+ +params = parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advWord=100%20-%201000%20Words', 3)
+assert {'q': 'wragge', 'category': 'newspaper', 'l-word': '100 - 1000 Words', 'l-artType': 'newspapers'} == params
+assert query_api(params) == 200
+