From 195d0b1477e838718298e3b59bbdde9b48a81f61 Mon Sep 17 00:00:00 2001 From: Cristian Consonni Date: Fri, 27 May 2016 15:17:26 +0200 Subject: [PATCH 1/2] Proposal for support of multiple languages --- wikiciteparser/en/__init__.py | 32 ++++++++ wikiciteparser/it/__init__.py | 9 +++ wikiciteparser/parser.py | 26 ++++--- wikiciteparser/tests.py | 135 ++++++++++++++++++++++++++++++---- 4 files changed, 179 insertions(+), 23 deletions(-) create mode 100644 wikiciteparser/en/__init__.py create mode 100644 wikiciteparser/it/__init__.py diff --git a/wikiciteparser/en/__init__.py b/wikiciteparser/en/__init__.py new file mode 100644 index 0000000..5a03ab8 --- /dev/null +++ b/wikiciteparser/en/__init__.py @@ -0,0 +1,32 @@ +# -*- encoding: utf-8 -*- + +# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1 +citation_template_names = set([ + 'Citation', + 'Cite AV media', + 'Cite AV media notes', + 'Cite book', + 'Cite conference', + 'Cite DVD notes', + 'Cite encyclopedia', + 'Cite episode', + 'Cite interview', + 'Cite journal', + 'Cite mailing list', + 'Cite map', + 'Cite news', + 'Cite newsgroup', + 'Cite podcast', + 'Cite press release', + 'Cite report', + 'Cite serial', + 'Cite sign', + 'Cite speech', + 'Cite techreport', + 'Cite thesis', + 'Cite web', + 'Cite arXiv', + # TODO more could be added, + # see https://en.wikipedia.org/wiki/ + # Category:Citation_Style_1_specific-source_templates + ]) diff --git a/wikiciteparser/it/__init__.py b/wikiciteparser/it/__init__.py new file mode 100644 index 0000000..494f0ef --- /dev/null +++ b/wikiciteparser/it/__init__.py @@ -0,0 +1,9 @@ +# -*- encoding: utf-8 -*- + +# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1 +citation_template_names = set([ + 'Cita pubblicazione', + # TODO more could be added, + # see https://en.wikipedia.org/wiki/ + # Category:Citation_Style_1_specific-source_templates + ]) diff --git a/wikiciteparser/parser.py b/wikiciteparser/parser.py index 4852917..ae7bce8 100644 --- 
a/wikiciteparser/parser.py +++ b/wikiciteparser/parser.py @@ -8,6 +8,10 @@ import mwparserfromhell from time import sleep +from . import en +from . import it + + lua = lupa.LuaRuntime() luacode = '' luafilepath = os.path.join(os.path.dirname(__file__), 'cs1.lua') @@ -141,31 +145,35 @@ def params_to_dict(params): dct[param.name.strip()] = param.value.strip() return dct -def is_citation_template_name(template_name): + +def is_citation_template_name(template_name, lang='en'): """ Is this name the name of a citation template? If true, returns a normalized version of it. Otherwise, returns None """ if not template_name: return False + template_name = template_name.replace('_', ' ') template_name = template_name.strip() template_name = template_name[0].upper()+template_name[1:] - if template_name in citation_template_names: + + lang_module = __import__(lang, globals(), locals(), [], 1) + if template_name in lang_module.citation_template_names: return template_name -def parse_citation_template(template): + +def parse_citation_template(template, lang='en'): """ Takes a mwparserfromhell template object that represents a wikipedia citation, and converts it to a normalized representation as a dict. - :returns: a dict representing the template, or None if the template provided - does not represent a citation. + :returns: a dict representing the template, or None if the template + provided does not represent a citation. 
""" name = unicode(template.name) - if not is_citation_template_name(name): + if not is_citation_template_name(name, lang): return - return parse_citation_dict(params_to_dict(template.params), template_name=name) - - + return parse_citation_dict(params_to_dict(template.params), + template_name=name) diff --git a/wikiciteparser/tests.py b/wikiciteparser/tests.py index 418155e..6ff95f1 100644 --- a/wikiciteparser/tests.py +++ b/wikiciteparser/tests.py @@ -4,27 +4,134 @@ import unittest from wikiciteparser.parser import * + class ParsingTests(unittest.TestCase): def test_multiple_authors(self): - p = parse_citation_dict({"doi": "10.1111/j.1365-2486.2008.01559.x", "title": "Climate change, plant migration, and range collapse in a global biodiversity hotspot: the ''Banksia'' (Proteaceae) of Western Australia", "issue": "6", "journal": "Global Change Biology", "year": "2008", "volume": "14", "last4": "Dunn", "last1": "Fitzpatrick", "last3": "Sanders", "last2": "Gove", "first1": "Matthew C.", "first2": "Aaron D.", "first3": "Nathan J.", "first4": "Robert R.", "pages": "1\u201316"}, template_name='cite journal') - self.assertEqual(p['Authors'],[{'last': 'Fitzpatrick', 'first': 'Matthew C.'}, {'last': 'Gove', 'first': 'Aaron D.'}, {'last': 'Sanders', 'first': 'Nathan J.'}, {'last': 'Dunn', 'first': 'Robert R.'}]) + p = parse_citation_dict({"doi": "10.1111/j.1365-2486.2008.01559.x", + "title": "Climate change, plant migration, and range collapse in a global biodiversity hotspot: the ''Banksia'' (Proteaceae) of Western Australia", + "issue": "6", + "journal": "Global Change Biology", + "year": "2008", + "volume": "14", + "last4": "Dunn", + "last1": "Fitzpatrick", + "last3": "Sanders", + "last2": "Gove", "first1": + "Matthew C.", + "first2": "Aaron D.", + "first3": "Nathan J.", + "first4": "Robert R.", + "pages": "1\u201316" + }, + template_name='cite journal') + self.assertEqual(p['Authors'], [{'last': 'Fitzpatrick', + 'first': 'Matthew C.' 
+ }, + {'last': 'Gove', + 'first': 'Aaron D.'}, + {'last': 'Sanders', + 'first': 'Nathan J.'}, + {'last': 'Dunn', + 'first': 'Robert R.' + } + ]) def test_vauthors(self): - p = parse_citation_dict({"doi": "10.1016/s1097-2765(00)80111-2", "title": "SAP30, a component of the mSin3 corepressor complex involved in N-CoR-mediated repression by specific transcription factors", "journal": "Mol. Cell", "volume": "2", "date": "July 1998", "pmid": "9702189", "issue": "1", "pages": "33\u201342", "vauthors": "Laherty CD, Billin AN, Lavinsky RM, Yochum GS, Bush AC, Sun JM, Mullen TM, Davie JR, Rose DW, Glass CK, Rosenfeld MG, Ayer DE, Eisenman RN"}, template_name='cite journal') - self.assertEqual(p['Authors'],[{'last': 'Laherty', 'first': 'CD'}, {'last': 'Billin', 'first': 'AN'}, {'last': 'Lavinsky', 'first': 'RM'}, {'last': 'Yochum', 'first': 'GS'}, {'last': 'Bush', 'first': 'AC'}, {'last': 'Sun', 'first': 'JM'}, {'last': 'Mullen', 'first': 'TM'}, {'last': 'Davie', 'first': 'JR'}, {'last': 'Rose', 'first': 'DW'}, {'last': 'Glass', 'first': 'CK'}, {'last': 'Rosenfeld', 'first': 'MG'}, {'last': 'Ayer', 'first': 'DE'}, {'last': 'Eisenman', 'first': 'RN'}]) + p = parse_citation_dict({"doi": "10.1016/s1097-2765(00)80111-2", + "title": "SAP30, a component of the mSin3 corepressor complex involved in N-CoR-mediated repression by specific transcription factors", + "journal": "Mol. 
Cell", + "volume": "2", + "date": "July 1998", + "pmid": "9702189", + "issue": "1", + "pages": "33\u201342", + "vauthors": "Laherty CD, Billin AN, Lavinsky RM, Yochum GS, Bush AC, Sun JM, Mullen TM, Davie JR, Rose DW, Glass CK, Rosenfeld MG, Ayer DE, Eisenman RN" + }, + template_name='cite journal') + self.assertEqual(p['Authors'], [{'last': 'Laherty', + 'first': 'CD' + }, + {'last': 'Billin', + 'first': 'AN' + }, + {'last': 'Lavinsky', + 'first': 'RM' + }, + {'last': 'Yochum', + 'first': 'GS' + }, + {'last': 'Bush', + 'first': 'AC' + }, + {'last': 'Sun', + 'first': 'JM' + }, + {'last': 'Mullen', + 'first': 'TM' + }, + {'last': 'Davie', + 'first': 'JR' + }, + {'last': 'Rose', + 'first': 'DW' + }, + {'last': 'Glass', + 'first': 'CK' + }, + {'last': 'Rosenfeld', + 'first': 'MG' + }, + {'last': 'Ayer', + 'first': 'DE' + }, + {'last': 'Eisenman', + 'first': 'RN' + } + ]) def test_remove_links(self): - p = parse_citation_dict({"title": "Mobile, Alabama", "url": "http://archive.org/stream/ballouspictorial1112ball#page/408/mode/2up", "journal": "[[Ballou's Pictorial Drawing-Room Companion]]", "volume": "12", "location": "Boston", "date": "June 27, 1857"}, template_name='cite journal') - self.assertEqual(p['Periodical'], "Ballou's Pictorial Drawing-Room Companion") + p = parse_citation_dict({"title": "Mobile, Alabama", + "url": "http://archive.org/stream/ballouspictorial1112ball#page/408/mode/2up", + "journal": "[[Ballou's Pictorial Drawing-Room Companion]]", + "volume": "12", + "location": "Boston", + "date": "June 27, 1857" + }, + template_name='cite journal') + self.assertEqual(p['Periodical'], + "Ballou's Pictorial Drawing-Room Companion") def test_authorlink(self): - p = parse_citation_dict({"publisher": "[[World Bank]]", "isbn": "978-0821369418", "title": "Performance Accountability and Combating Corruption", "url": "http://siteresources.worldbank.org/INTWBIGOVANTCOR/Resources/DisruptingCorruption.pdf", "page": "309", "last1": "Shah", "location": "[[Washington, 
D.C.]], [[United States|U.S.]]", "year": "2007", "first1": "Anwar", "authorlink1": "Anwar Shah", "oclc": "77116846"}, template_name='citation') - self.assertEqual(p['Authors'], [{'link': 'Anwar Shah', 'last': 'Shah', 'first': 'Anwar'}]) + p = parse_citation_dict({"publisher": "[[World Bank]]", + "isbn": "978-0821369418", + "title": "Performance Accountability and Combating Corruption", + "url": "http://siteresources.worldbank.org/INTWBIGOVANTCOR/Resources/DisruptingCorruption.pdf", + "page": "309", + "last1": "Shah", + "location": "[[Washington, D.C.]], [[United States|U.S.]]", + "year": "2007", + "first1": "Anwar", + "authorlink1": "Anwar Shah", + "oclc": "77116846" + }, + template_name='citation') + self.assertEqual(p['Authors'], [{'link': 'Anwar Shah', + 'last': 'Shah', + 'first': 'Anwar' + } + ]) def test_unicode(self): - p = parse_citation_dict({"title": "\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)", "url": "http://magazines.russ.ru/ural/2004/10/mar11.html", "journal": "\u0423\u0440\u0430\u043b", "author": "Margovenko, A", "volume": "10", "year": "2004"}, template_name='cite journal') - self.assertEqual(p['Title'], '\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)') - + p = parse_citation_dict({"title": "\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)", + "url": "http://magazines.russ.ru/ural/2004/10/mar11.html", + "journal": "\u0423\u0440\u0430\u043b", + "author": "Margovenko, A", + "volume": "10", + "year": "2004" + }, + template_name='cite journal') + self.assertEqual(p['Title'], + '\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)') def test_mwtext(self): # taken from https://en.wikipedia.org/wiki/Joachim_Lambek @@ -40,11 +147,11 @@ def test_mwtext(self): """ wikicode = mwparserfromhell.parse(mwtext) for tpl in wikicode.filter_templates(): - parsed = parse_citation_template(tpl) + parsed = 
parse_citation_template(tpl, 'en') print parsed - self.assertIsInstance(parsed, dict) # because all templates in this example are citation templates + # All templates in this example are citation templates + self.assertIsInstance(parsed, dict) if __name__ == '__main__': unittest.main() - From 450296e7103683ae27b542d488c150c1f677873e Mon Sep 17 00:00:00 2001 From: Cristian Consonni Date: Fri, 27 May 2016 15:25:25 +0200 Subject: [PATCH 2/2] Remove definition of citation_template_names for enwiki from parser.py, moved to wikiciteparser/en/__init__.py --- wikiciteparser/parser.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/wikiciteparser/parser.py b/wikiciteparser/parser.py index ae7bce8..5a33b2c 100644 --- a/wikiciteparser/parser.py +++ b/wikiciteparser/parser.py @@ -18,36 +18,6 @@ with open(luafilepath, 'r') as f: luacode = f.read() -# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1 -citation_template_names = set([ - 'Citation', - 'Cite AV media', - 'Cite AV media notes', - 'Cite book', - 'Cite conference', - 'Cite DVD notes', - 'Cite encyclopedia', - 'Cite episode', - 'Cite interview', - 'Cite journal', - 'Cite mailing list', - 'Cite map', - 'Cite news', - 'Cite newsgroup', - 'Cite podcast', - 'Cite press release', - 'Cite report', - 'Cite serial', - 'Cite sign', - 'Cite speech', - 'Cite techreport', - 'Cite thesis', - 'Cite web', - 'Cite arXiv', - # TODO more could be added, - # see https://en.wikipedia.org/wiki/Category:Citation_Style_1_specific-source_templates - ]) - # MediaWiki utilities simulated by Python wrappers def lua_to_python_re(regex): rx = re.sub('%a', '[a-zA-Z]', regex) # letters