Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proposal for support of multiple languages #3

Merged
merged 2 commits into from
May 27, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions wikiciteparser/en/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# -*- encoding: utf-8 -*-

# Template names recognised as citations for the English ('en') module.
# Source: https://en.wikipedia.org/wiki/Help:Citation_Style_1
citation_template_names = {
    'Citation',
    'Cite AV media',
    'Cite AV media notes',
    'Cite arXiv',
    'Cite book',
    'Cite conference',
    'Cite DVD notes',
    'Cite encyclopedia',
    'Cite episode',
    'Cite interview',
    'Cite journal',
    'Cite mailing list',
    'Cite map',
    'Cite news',
    'Cite newsgroup',
    'Cite podcast',
    'Cite press release',
    'Cite report',
    'Cite serial',
    'Cite sign',
    'Cite speech',
    'Cite techreport',
    'Cite thesis',
    'Cite web',
    # TODO: further templates could be listed here, see
    # https://en.wikipedia.org/wiki/
    # Category:Citation_Style_1_specific-source_templates
}
9 changes: 9 additions & 0 deletions wikiciteparser/it/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- encoding: utf-8 -*-

# Template names recognised as citations for the Italian ('it') module.
# NOTE(review): the previous comment credited
# https://en.wikipedia.org/wiki/Help:Citation_Style_1 as the source, which
# looks copied from the 'en' module -- confirm the actual source of this list.
citation_template_names = {
    'Cita pubblicazione',
    # TODO: further templates could be listed here; the pointer inherited
    # from the 'en' module is https://en.wikipedia.org/wiki/
    # Category:Citation_Style_1_specific-source_templates
}
56 changes: 17 additions & 39 deletions wikiciteparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,16 @@
import mwparserfromhell
from time import sleep

from . import en
from . import it


# Lua interpreter instance shared by this module.
lua = lupa.LuaRuntime()

# Read the Lua source file that ships next to this module (cs1.lua) into
# ``luacode``. A failure to open or read the file aborts the import, so no
# placeholder value for ``luacode`` is needed beforehand.
luafilepath = os.path.join(os.path.dirname(__file__), 'cs1.lua')
with open(luafilepath, 'r') as f:
    luacode = f.read()

# Template names recognised as citations.
# Source: https://en.wikipedia.org/wiki/Help:Citation_Style_1
citation_template_names = {
    'Citation',
    'Cite AV media',
    'Cite AV media notes',
    'Cite arXiv',
    'Cite book',
    'Cite conference',
    'Cite DVD notes',
    'Cite encyclopedia',
    'Cite episode',
    'Cite interview',
    'Cite journal',
    'Cite mailing list',
    'Cite map',
    'Cite news',
    'Cite newsgroup',
    'Cite podcast',
    'Cite press release',
    'Cite report',
    'Cite serial',
    'Cite sign',
    'Cite speech',
    'Cite techreport',
    'Cite thesis',
    'Cite web',
    # TODO: further templates could be listed here, see
    # https://en.wikipedia.org/wiki/Category:Citation_Style_1_specific-source_templates
}

# MediaWiki utilities simulated by Python wrappers
def lua_to_python_re(regex):
rx = re.sub('%a', '[a-zA-Z]', regex) # letters
Expand Down Expand Up @@ -141,31 +115,35 @@ def params_to_dict(params):
dct[param.name.strip()] = param.value.strip()
return dct

def is_citation_template_name(template_name, lang='en'):
    """
    Is this name the name of a citation template?

    The name is normalized first: underscores become spaces, surrounding
    whitespace is stripped and the first character is upper-cased. The
    result is then looked up in the ``citation_template_names`` collection
    of the language module named by ``lang``.

    :param template_name: raw template name, e.g. ``'cite_journal'``
    :param lang: name of the language module to consult (default ``'en'``)
    :returns: the normalized name if it is a known citation template,
              otherwise None (including for empty or whitespace-only names)
    :raises ImportError: if no module can be imported for ``lang``
    """
    if not template_name:
        return None

    # Re-check emptiness after normalizing: a name made only of whitespace
    # or underscores collapses to '' and would make the [0] index below
    # raise IndexError.
    normalized = template_name.replace('_', ' ').strip()
    if not normalized:
        return None
    normalized = normalized[0].upper() + normalized[1:]

    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import importlib

    # A bare __import__(lang) only finds a sibling module through Python 2
    # implicit relative imports. Resolve it explicitly relative to the
    # package containing this module, and fall back to an absolute import
    # when the module is not inside a package (e.g. run as a script).
    package = __name__.rpartition('.')[0]
    if package:
        lang_module = importlib.import_module('.' + lang, package)
    else:
        lang_module = importlib.import_module(lang)

    if normalized in lang_module.citation_template_names:
        return normalized
    return None

def parse_citation_template(template, lang='en'):
    """
    Convert a mwparserfromhell template object representing a Wikipedia
    citation into a normalized dict.

    :param template: the template object; its ``name`` and ``params``
        attributes are read
    :param lang: language code forwarded to ``is_citation_template_name``
    :returns: a dict representing the template, or None if the template
        provided does not represent a citation.
    """
    raw_name = unicode(template.name)
    if is_citation_template_name(raw_name, lang):
        # The name is passed on exactly as it appears on the template,
        # not in its normalized form.
        fields = params_to_dict(template.params)
        return parse_citation_dict(fields, template_name=raw_name)
    return None
135 changes: 121 additions & 14 deletions wikiciteparser/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,134 @@
import unittest
from wikiciteparser.parser import *


class ParsingTests(unittest.TestCase):
def test_multiple_authors(self):
    # Four numbered lastN/firstN pairs, given out of order, are expected
    # to come back as four author entries ordered by their number.
    fields = {
        "doi": "10.1111/j.1365-2486.2008.01559.x",
        "title": "Climate change, plant migration, and range collapse in a global biodiversity hotspot: the ''Banksia'' (Proteaceae) of Western Australia",
        "issue": "6",
        "journal": "Global Change Biology",
        "year": "2008",
        "volume": "14",
        "last4": "Dunn",
        "last1": "Fitzpatrick",
        "last3": "Sanders",
        "last2": "Gove",
        "first1": "Matthew C.",
        "first2": "Aaron D.",
        "first3": "Nathan J.",
        "first4": "Robert R.",
        "pages": "1\u201316",
    }
    parsed = parse_citation_dict(fields, template_name='cite journal')

    expected_authors = [
        {'last': 'Fitzpatrick', 'first': 'Matthew C.'},
        {'last': 'Gove', 'first': 'Aaron D.'},
        {'last': 'Sanders', 'first': 'Nathan J.'},
        {'last': 'Dunn', 'first': 'Robert R.'},
    ]
    self.assertEqual(parsed['Authors'], expected_authors)

def test_vauthors(self):
    # A single comma-separated "vauthors" value is expected to be split
    # into one entry per author, with the initials as the first name.
    fields = {
        "doi": "10.1016/s1097-2765(00)80111-2",
        "title": "SAP30, a component of the mSin3 corepressor complex involved in N-CoR-mediated repression by specific transcription factors",
        "journal": "Mol. Cell",
        "volume": "2",
        "date": "July 1998",
        "pmid": "9702189",
        "issue": "1",
        "pages": "33\u201342",
        "vauthors": "Laherty CD, Billin AN, Lavinsky RM, Yochum GS, Bush AC, Sun JM, Mullen TM, Davie JR, Rose DW, Glass CK, Rosenfeld MG, Ayer DE, Eisenman RN",
    }
    parsed = parse_citation_dict(fields, template_name='cite journal')

    # (last, first) pairs in the order they appear in "vauthors".
    name_pairs = [
        ('Laherty', 'CD'),
        ('Billin', 'AN'),
        ('Lavinsky', 'RM'),
        ('Yochum', 'GS'),
        ('Bush', 'AC'),
        ('Sun', 'JM'),
        ('Mullen', 'TM'),
        ('Davie', 'JR'),
        ('Rose', 'DW'),
        ('Glass', 'CK'),
        ('Rosenfeld', 'MG'),
        ('Ayer', 'DE'),
        ('Eisenman', 'RN'),
    ]
    expected_authors = [{'last': last, 'first': first}
                        for last, first in name_pairs]
    self.assertEqual(parsed['Authors'], expected_authors)

def test_remove_links(self):
    # The [[...]] wiki-link brackets around the journal name are expected
    # to be gone from the parsed 'Periodical' value.
    fields = {
        "title": "Mobile, Alabama",
        "url": "http://archive.org/stream/ballouspictorial1112ball#page/408/mode/2up",
        "journal": "[[Ballou's Pictorial Drawing-Room Companion]]",
        "volume": "12",
        "location": "Boston",
        "date": "June 27, 1857",
    }
    parsed = parse_citation_dict(fields, template_name='cite journal')

    expected_periodical = "Ballou's Pictorial Drawing-Room Companion"
    self.assertEqual(parsed['Periodical'], expected_periodical)

def test_authorlink(self):
    # An "authorlink1" parameter is expected to show up as the 'link'
    # key of the matching author entry.
    fields = {
        "publisher": "[[World Bank]]",
        "isbn": "978-0821369418",
        "title": "Performance Accountability and Combating Corruption",
        "url": "http://siteresources.worldbank.org/INTWBIGOVANTCOR/Resources/DisruptingCorruption.pdf",
        "page": "309",
        "last1": "Shah",
        "location": "[[Washington, D.C.]], [[United States|U.S.]]",
        "year": "2007",
        "first1": "Anwar",
        "authorlink1": "Anwar Shah",
        "oclc": "77116846",
    }
    parsed = parse_citation_dict(fields, template_name='citation')

    expected_authors = [
        {'link': 'Anwar Shah', 'last': 'Shah', 'first': 'Anwar'},
    ]
    self.assertEqual(parsed['Authors'], expected_authors)

def test_unicode(self):
    # A title containing non-ASCII (Cyrillic) characters is expected to
    # come back unchanged in the parsed 'Title' value.
    fields = {
        "title": "\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)",
        "url": "http://magazines.russ.ru/ural/2004/10/mar11.html",
        "journal": "\u0423\u0440\u0430\u043b",
        "author": "Margovenko, A",
        "volume": "10",
        "year": "2004",
    }
    parsed = parse_citation_dict(fields, template_name='cite journal')

    expected_title = '\u0414\u043e\u0440\u043e\u0433\u0438 \u0446\u0430\u0440\u0435\u0439 (Roads of Emperors)'
    self.assertEqual(parsed['Title'], expected_title)

def test_mwtext(self):
# taken from https://en.wikipedia.org/wiki/Joachim_Lambek
Expand All @@ -40,11 +147,11 @@ def test_mwtext(self):
"""
wikicode = mwparserfromhell.parse(mwtext)
for tpl in wikicode.filter_templates():
parsed = parse_citation_template(tpl)
parsed = parse_citation_template(tpl, 'en')
print parsed
self.assertIsInstance(parsed, dict) # because all templates in this example are citation templates
# All templates in this example are citation templates
self.assertIsInstance(parsed, dict)


if __name__ == '__main__':
unittest.main()