forked from fabianvf/PLoS-API-consumer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconsumer.py
More file actions
85 lines (71 loc) · 2.8 KB
/
Copy pathconsumer.py
File metadata and controls
85 lines (71 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
__author__ = 'faye'
from scrapi_tools.consumer import BaseConsumer, RawFile, NormalizedFile
import requests
import xmltodict
import json
import time
from datetime import date, timedelta
import sys
# Python 2 only: make UTF-8 the interpreter-wide default encoding used for
# implicit str <-> unicode conversions. The reload() is required because
# sys.setdefaultencoding is removed from the sys namespace during interpreter
# start-up and only becomes available again once the module is reloaded.
reload(sys)
sys.setdefaultencoding('utf-8')
import settings
# Upper bound of the publication-date query window: midnight (Z) of the day
# this module was imported. The value is computed once, at import time.
TODAY = date.today().isoformat() + "T00:00:00Z"
# Lower bound of the query window. Despite the name, this is midnight four
# days before the import date, not one day before.
YESTERDAY = (date.today() - timedelta(days=4)).isoformat() + "T00:00:00Z"
# Page size used when paging through the search results.
MAX_ROWS_PER_REQUEST = 999
class PLoSConsumer(BaseConsumer):
    """Consumer for the PLoS search API.

    ``consume`` downloads raw search results for the publication-date window
    bounded by the module-level ``YESTERDAY`` and ``TODAY`` strings, and
    ``normalize`` converts one stored raw result into a ``NormalizedFile``.
    """

    # Minimum spacing, in seconds, between successive page requests.
    _SECONDS_PER_PAGE = 5

    def __init__(self):
        # Nothing to see here
        pass

    def consume(self):
        """Collect the raw search results for the current date window.

        A first request with ``rows=0`` reads the total hit count
        (``@numFound``). The results are then fetched in pages of
        ``MAX_ROWS_PER_REQUEST``, waiting so that at least
        ``_SECONDS_PER_PAGE`` seconds separate the end of one page request
        from the next.

        A result is kept only when its second ``arr`` element is named
        ``abstract`` and the text of its fourth ``str`` element is
        ``Research Article``; results that do not have that layout are
        skipped silently.

        Returns:
            list: one ``RawFile`` per kept result, each carrying the result
            serialised as JSON. Empty when the query matched nothing.
        """
        base_url = 'http://api.plos.org/search?q=publication_date:[{}%20TO%20{}]'.format(YESTERDAY, TODAY)

        # Zero rows: this request is only used to learn how many hits exist.
        count_payload = {"api_key": settings.API_KEY, "rows": "0"}
        plos_request = requests.get(base_url, params=count_payload)
        response = xmltodict.parse(plos_request.text)
        num_articles = int(response["response"]["result"]["@numFound"])

        doc_list = []
        # Fixed page size with an advancing offset. Growing "rows" together
        # with "start" would make consecutive pages overlap and collect the
        # same articles more than once.
        for start in range(0, num_articles, MAX_ROWS_PER_REQUEST):
            payload = {
                "api_key": settings.API_KEY,
                "rows": MAX_ROWS_PER_REQUEST,
                "start": start,
            }
            results = requests.get(base_url, params=payload)
            tick = time.time()
            page = xmltodict.parse(results.text)

            try:
                docs = page["response"]["result"]["doc"]
            except (KeyError, TypeError):
                # The page carries no <doc> elements: nothing left to read.
                print("No new files/updates!")
                break

            # xmltodict yields a single mapping instead of a list when the
            # page holds exactly one <doc>; iterate it as a one-item list.
            if isinstance(docs, dict):
                docs = [docs]

            # TODO Incorporate "Correction" article type
            for result in docs:
                try:
                    if result["arr"][1]["@name"] == "abstract" and result["str"][3]["#text"] == "Research Article":
                        doc_list.append(RawFile({
                            'doc': json.dumps(result, indent=4, sort_keys=True),
                            'source': 'PLoS',
                            'doc_id': result["str"][0]["#text"],
                            'filetype': 'json',
                        }))
                except (KeyError, IndexError, TypeError):
                    # Result does not follow the positional layout relied on
                    # above (missing or too few "arr"/"str" entries): skip it.
                    continue

            # Throttle: read the clock once so the computed delay can never
            # be negative by the time it is handed to sleep().
            remaining = self._SECONDS_PER_PAGE - (time.time() - tick)
            if remaining > 0:
                time.sleep(remaining)

        return doc_list

    def normalize(self, raw_doc, timestamp):
        """Build a ``NormalizedFile`` from one raw document.

        Args:
            raw_doc: object whose ``get('doc')`` returns the JSON text that
                ``consume`` stored under the ``'doc'`` key.
            timestamp: value stored unchanged under ``'timestamp'``.

        Returns:
            NormalizedFile: title, contributors, description, id, source and
            timestamp read from fixed positions of the decoded record.

        Raises:
            KeyError, IndexError: if the record lacks the positional entries
                read below.
        """
        record = json.loads(raw_doc.get('doc'))
        str_fields = record["str"]
        arr_fields = record["arr"]
        # Positional layout: str[0] -> id, str[4] -> title,
        # arr[0] -> contributors, arr[1] -> abstract (checked by consume()).
        return NormalizedFile({
            'title': str_fields[4]["#text"],
            'contributors': arr_fields[0]["str"],
            'properties': {
                'description': arr_fields[1]["str"],
            },
            'meta': {},
            'id': str_fields[0]["#text"],
            'source': "PLoS",
            'timestamp': timestamp
        })
# Script entry point: build a consumer and print whatever its lint() returns.
if __name__ == "__main__":
    plos_consumer = PLoSConsumer()
    lint_result = plos_consumer.lint()
    print(lint_result)