inspire.py
#!/usr/bin/env python
'''
inspire-req.py -- InspireHep riq-index calculation

Copyright 2015 Sujeet Akula ([email protected])
Licensed under GNU GPLv2

More info:
 * http://github.com/freeboson/inspire-req/
 * http://arXiv.org/abs/1209.2124
'''

import urllib2
from urllib import urlencode
from lxml import etree
from StringIO import StringIO
from HTMLParser import HTMLParser
import re, feedparser
import numpy as np
from time import mktime, gmtime
from datetime import datetime

seconds_in_year = 3.15569e7

headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}

base_url = 'http://inspirehep.net/rss'
base_dat = {'ln': 'en',
            'rg': '500'}  # rg is the number of entries per request

# should fix this part
record_filter = re.compile(r'.*record\/([\d]*)[/]{,1}')  # record id will be in \1

marc = '{http://www.loc.gov/MARC21/slim}'
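
# fetch_records: runs an INSPIRE search through the RSS endpoint above and
# returns the list of record ids, extracted from each entry's URL with
# record_filter.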
def fetch_records(search):
    records = list()
    data = dict(base_dat)  # copy the defaults so the shared dict is not mutated
    data['p'] = search
    request = urllib2.Request(base_url, urlencode(data), headers)
    xml = urllib2.urlopen(request).read()
    feed = feedparser.parse(xml)
    for entry in feed.entries:
        records.append(record_filter.sub(r'\1', entry.id))
    return records
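
# get_bib_length: downloads the record's MARCXML export and counts the
# datafields with tag 999 and indicators C5, i.e. the entries in the
# paper's reference list.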
def get_bib_length(inspire_id):
    url = u'http://inspirehep.net/record/{}/export/xm'.format(inspire_id)
    xml = urllib2.urlopen(urllib2.Request(url)).read()
    root = etree.parse(StringIO(xml)).getroot()
    record = root.find(marc + 'record').getiterator()
    # if you could "jump" to the last record with these tags
    # you could get the id of the last reference, which is
    # equivalent to the count -- not sure if I can with etree
    count = 0
    for elem in record:
        if elem.get('tag') == '999' and \
           elem.get('ind1') == 'C' and \
           elem.get('ind2') == '5':
            count += 1
    return float(count)
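
# get_num_authors: parses the record's xd export with feedparser and counts
# the entries in its 'authors' field.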
def get_num_authors(inspire_id):
    url = u'http://inspirehep.net/record/{}/export/xd'.format(inspire_id)
    xml = urllib2.urlopen(urllib2.Request(url)).read()
    feed = feedparser.parse(xml)
    return float(len(feed.feed.get('authors')))
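
# get_pub_date: the feed's 'updated' timestamp is used here as a proxy for
# the record's publication date.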
def get_pub_date(inspire_id):
    url = u'http://inspirehep.net/record/{}/export/xd'.format(inspire_id)
    xml = urllib2.urlopen(urllib2.Request(url)).read()
    feed = feedparser.parse(xml)
    return datetime.fromtimestamp(mktime(feed.feed.get('updated_parsed')))
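
# riq_analysis: for each of the author's papers i, with a_i authors, collect
# the citing records (self-citations excluded) and the reference-list length
# r_c of each citing record c.  As implemented below,
#     tori = sum_i (1/a_i) * sum_{c cites i} 1/r_c
#     riq  = 1000 * sqrt(tori) / (years since the earliest paper)
# following the tori/riq definitions referenced in the header
# (arXiv:1209.2124).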
def riq_analysis(author="S.Akula.1"):
    print("Computing riq index for InspireHep author " + author + ".")

    print("Fetching " + author + "'s bibliography...")
    papers = fetch_records("author:{0}".format(author))
    num_authors = np.array(map(get_num_authors, papers))
    print("{} has {} papers on InspireHep.net".format(author, len(papers)))
    print("On average, {}'s papers have {} authors".format(author,
                                                           np.mean(num_authors)))

    print("Fetching citing articles (excl. self-cites)...")
    cites = map(lambda recid: fetch_records(
        u"refersto:recid:{} -author:{}".format(recid, author)), papers)
    print("Found {} citations.".format(sum(map(len, cites))))

    print("Finding bibliography length for citing articles...")
    total_bib_lengths = list()
    cached_lengths = dict()
    for cite_list in cites:
        bib_lengths = np.zeros(len(cite_list))
        for idx in range(len(bib_lengths)):
            recid = cite_list[idx]
            if recid in cached_lengths:
                bib_lengths[idx] = cached_lengths[recid]
            else:
                bib_lengths[idx] = get_bib_length(recid)
                cached_lengths[recid] = bib_lengths[idx]
        total_bib_lengths.append(bib_lengths)
    # print("Saved {} lookups by caching.".format(sum(map(len, cites))
    #                                             - len(cached_lengths)))

    print("Computing tori...")
    r_inverse = np.array([np.sum(np.reciprocal(x)) for x in total_bib_lengths])
    tori = np.dot(r_inverse, np.reciprocal(num_authors))
    print("{}'s tori = {}".format(author, tori))

    print("Computing riq...")
    earliest_pub_date = min(map(get_pub_date, papers))
    years_active = (datetime.fromtimestamp(mktime(gmtime())) -
                    earliest_pub_date).total_seconds() / seconds_in_year
    riq = np.sqrt(tori) / years_active
    print("{}'s riq-index = {}".format(author, int(np.round(1000 * riq))))
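
# Running this file directly computes the riq index for the default author id
# above; to analyse another author, call riq_analysis() with their INSPIRE
# author identifier (e.g. riq_analysis("Some.Author.1"), a placeholder id).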
if __name__ == "__main__":
    riq_analysis()