Skip to content

Commit

Permalink
Merge pull request #3 from CristianCantoro/master
Browse files Browse the repository at this point in the history
Proposal for support of multiple languages
  • Loading branch information
wetneb committed May 27, 2016
2 parents 1af8fd3 + 450296e commit b5988ab
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 53 deletions.
32 changes: 32 additions & 0 deletions wikiciteparser/en/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# -*- encoding: utf-8 -*-

# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1
# Names are stored with a capitalised first letter and spaces
# (not underscores) between words.
citation_template_names = {
    'Citation',
    'Cite AV media',
    'Cite AV media notes',
    'Cite book',
    'Cite conference',
    'Cite DVD notes',
    'Cite encyclopedia',
    'Cite episode',
    'Cite interview',
    'Cite journal',
    'Cite mailing list',
    'Cite map',
    'Cite news',
    'Cite newsgroup',
    'Cite podcast',
    'Cite press release',
    'Cite report',
    'Cite serial',
    'Cite sign',
    'Cite speech',
    'Cite techreport',
    'Cite thesis',
    'Cite web',
    'Cite arXiv',
    # TODO more could be added,
    # see https://en.wikipedia.org/wiki/
    # Category:Citation_Style_1_specific-source_templates
}
9 changes: 9 additions & 0 deletions wikiciteparser/it/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- encoding: utf-8 -*-

# Citation template names for the Italian ('it') language module.
# NOTE(review): the links below point at English Wikipedia pages and
# look copied from the English module -- confirm the Italian source.
# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1
citation_template_names = {
    'Cita pubblicazione',
    # TODO more could be added,
    # see https://en.wikipedia.org/wiki/
    # Category:Citation_Style_1_specific-source_templates
}
56 changes: 17 additions & 39 deletions wikiciteparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,16 @@
import mwparserfromhell
from time import sleep

from . import en
from . import it


# Lua runtime instance, created once when this module is imported.
lua = lupa.LuaRuntime()
# Text of the cs1.lua script; filled in by the read below.
luacode = ''
# cs1.lua is looked up in the same directory as this module.
luafilepath = os.path.join(os.path.dirname(__file__), 'cs1.lua')
with open(luafilepath, 'r') as f:
    luacode = f.read()

# taken from https://en.wikipedia.org/wiki/Help:Citation_Style_1
# Names are stored with a capitalised first letter and spaces
# (not underscores) between words.
citation_template_names = {
    'Citation',
    'Cite AV media',
    'Cite AV media notes',
    'Cite book',
    'Cite conference',
    'Cite DVD notes',
    'Cite encyclopedia',
    'Cite episode',
    'Cite interview',
    'Cite journal',
    'Cite mailing list',
    'Cite map',
    'Cite news',
    'Cite newsgroup',
    'Cite podcast',
    'Cite press release',
    'Cite report',
    'Cite serial',
    'Cite sign',
    'Cite speech',
    'Cite techreport',
    'Cite thesis',
    'Cite web',
    'Cite arXiv',
    # TODO more could be added,
    # see https://en.wikipedia.org/wiki/Category:Citation_Style_1_specific-source_templates
}

# MediaWiki utilities simulated by Python wrappers
def lua_to_python_re(regex):
rx = re.sub('%a', '[a-zA-Z]', regex) # letters
Expand Down Expand Up @@ -141,31 +115,35 @@ def params_to_dict(params):
dct[param.name.strip()] = param.value.strip()
return dct

def is_citation_template_name(template_name, lang='en'):
    """
    Is this name the name of a citation template?
    If true, returns a normalized version of it. Otherwise, returns None

    :param template_name: raw template name. Underscores are treated as
        spaces, surrounding whitespace is dropped and the first character
        is upper-cased before the lookup.
    :param lang: name of the language submodule of this package whose
        ``citation_template_names`` set is consulted (default ``'en'``).
    :returns: the normalized template name, or None when the name is
        empty, blank, or not listed for that language.
    :raises ImportError: if no submodule named ``lang`` can be imported.
    """
    # Function-scope import: only part of the module-level import block
    # is visible here, so the dependency is kept local to this function.
    import importlib

    if not template_name:
        return None

    template_name = template_name.replace('_', ' ').strip()
    if not template_name:
        # A name made only of whitespace/underscores is empty after
        # normalization; indexing [0] below would raise IndexError.
        return None
    template_name = template_name[0].upper() + template_name[1:]

    # A bare ``__import__(lang)`` resolves ``lang`` as a top-level module,
    # which only works when the package directory itself is on sys.path.
    # Import the language module relative to this package instead.
    package = __package__ or __name__.rpartition('.')[0]
    lang_module = importlib.import_module('.' + lang, package=package)
    if template_name in lang_module.citation_template_names:
        return template_name
    return None

def parse_citation_template(template, lang='en'):
    """
    Convert a mwparserfromhell template object representing a wikipedia
    citation into a normalized dict.

    :param template: the template object; its ``name`` and ``params``
        attributes are read.
    :param lang: language code forwarded to is_citation_template_name.
    :returns: a dict representing the template, or None if the template
              provided does not represent a citation.
    """
    name = unicode(template.name)
    if is_citation_template_name(name, lang):
        fields = params_to_dict(template.params)
        return parse_citation_dict(fields, template_name=name)
    return None
135 changes: 121 additions & 14 deletions wikiciteparser/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,134 @@
import unittest
from wikiciteparser.parser import *


class ParsingTests(unittest.TestCase):
def test_multiple_authors(self):
    """Numbered lastN/firstN parameters give the expected author list."""
    fields = {
        "doi": "10.1111/j.1365-2486.2008.01559.x",
        "title": "Climate change, plant migration, and range collapse in a global biodiversity hotspot: the ''Banksia'' (Proteaceae) of Western Australia",
        "issue": "6",
        "journal": "Global Change Biology",
        "year": "2008",
        "volume": "14",
        "last4": "Dunn",
        "last1": "Fitzpatrick",
        "last3": "Sanders",
        "last2": "Gove",
        "first1": "Matthew C.",
        "first2": "Aaron D.",
        "first3": "Nathan J.",
        "first4": "Robert R.",
        "pages": "1\u201316",
    }
    expected_authors = [
        {'last': 'Fitzpatrick', 'first': 'Matthew C.'},
        {'last': 'Gove', 'first': 'Aaron D.'},
        {'last': 'Sanders', 'first': 'Nathan J.'},
        {'last': 'Dunn', 'first': 'Robert R.'},
    ]
    parsed = parse_citation_dict(fields, template_name='cite journal')
    self.assertEqual(parsed['Authors'], expected_authors)

def test_vauthors(self):
    """A 'vauthors' string is split into one last/first entry per author."""
    fields = {
        "doi": "10.1016/s1097-2765(00)80111-2",
        "title": "SAP30, a component of the mSin3 corepressor complex involved in N-CoR-mediated repression by specific transcription factors",
        "journal": "Mol. Cell",
        "volume": "2",
        "date": "July 1998",
        "pmid": "9702189",
        "issue": "1",
        "pages": "33\u201342",
        "vauthors": "Laherty CD, Billin AN, Lavinsky RM, Yochum GS, Bush AC, Sun JM, Mullen TM, Davie JR, Rose DW, Glass CK, Rosenfeld MG, Ayer DE, Eisenman RN",
    }
    # (last, first) pairs in the order they appear in the vauthors string.
    author_pairs = [
        ('Laherty', 'CD'),
        ('Billin', 'AN'),
        ('Lavinsky', 'RM'),
        ('Yochum', 'GS'),
        ('Bush', 'AC'),
        ('Sun', 'JM'),
        ('Mullen', 'TM'),
        ('Davie', 'JR'),
        ('Rose', 'DW'),
        ('Glass', 'CK'),
        ('Rosenfeld', 'MG'),
        ('Ayer', 'DE'),
        ('Eisenman', 'RN'),
    ]
    expected_authors = [{'last': last, 'first': first}
                        for last, first in author_pairs]
    parsed = parse_citation_dict(fields, template_name='cite journal')
    self.assertEqual(parsed['Authors'], expected_authors)

def test_remove_links(self):
    """The [[...]] wiki-link brackets are stripped from the journal name."""
    fields = {
        "title": "Mobile, Alabama",
        "url": "http://archive.org/stream/ballouspictorial1112ball#page/408/mode/2up",
        "journal": "[[Ballou's Pictorial Drawing-Room Companion]]",
        "volume": "12",
        "location": "Boston",
        "date": "June 27, 1857",
    }
    expected_periodical = "Ballou's Pictorial Drawing-Room Companion"
    parsed = parse_citation_dict(fields, template_name='cite journal')
    self.assertEqual(parsed['Periodical'], expected_periodical)

def test_authorlink(self):
    """An 'authorlink1' value is reported as the author's 'link' field."""
    fields = {
        "publisher": "[[World Bank]]",
        "isbn": "978-0821369418",
        "title": "Performance Accountability and Combating Corruption",
        "url": "http://siteresources.worldbank.org/INTWBIGOVANTCOR/Resources/DisruptingCorruption.pdf",
        "page": "309",
        "last1": "Shah",
        "location": "[[Washington, D.C.]], [[United States|U.S.]]",
        "year": "2007",
        "first1": "Anwar",
        "authorlink1": "Anwar Shah",
        "oclc": "77116846",
    }
    expected_authors = [
        {'link': 'Anwar Shah', 'last': 'Shah', 'first': 'Anwar'},
    ]
    parsed = parse_citation_dict(fields, template_name='citation')
    self.assertEqual(parsed['Authors'], expected_authors)

def test_unicode(self):
    """A title containing non-ASCII characters comes back unchanged."""
    title = '\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)'
    fields = {
        "title": "\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)",
        "url": "http://magazines.russ.ru/ural/2004/10/mar11.html",
        "journal": "\u0423\u0440\u0430\u043b",
        "author": "Margovenko, A",
        "volume": "10",
        "year": "2004",
    }
    parsed = parse_citation_dict(fields, template_name='cite journal')
    self.assertEqual(parsed['Title'], title)

def test_mwtext(self):
# taken from https://en.wikipedia.org/wiki/Joachim_Lambek
Expand All @@ -40,11 +147,11 @@ def test_mwtext(self):
"""
wikicode = mwparserfromhell.parse(mwtext)
for tpl in wikicode.filter_templates():
parsed = parse_citation_template(tpl)
parsed = parse_citation_template(tpl, 'en')
print parsed
self.assertIsInstance(parsed, dict) # because all templates in this example are citation templates
# All templates in this example are citation templates
self.assertIsInstance(parsed, dict)


# Run the unittest runner when this file is executed directly.
if __name__ == '__main__':
    unittest.main()

0 comments on commit b5988ab

Please sign in to comment.