# scraping_num_articles.py
"""
This script is an example of how to use arcas to collect a number of
metadata of articles on specific topic. In this example the five apis, Ieee,
Plos, Nature, arXiv and Springer and used.
The keywords used to search articles are "sustainable software",
"research software" and we are asking for a maximum number of 30 articles
from each api.
In each search 10 articles are asked.
"""
import arcas
import pandas as pd

keywords = ["sustainable software", "research software"]
num_collect = 10  # number of articles requested per page
max_num = 31      # paginate while start < 31, i.e. at most 30 articles

dfs = []
for p in [arcas.Nature, arcas.Arxiv, arcas.Ieee, arcas.Plos, arcas.Springer]:
    api = p()
    for key in keywords:
        start = 1
        switch = True
        while start < max_num and switch:
            # Build the query for the current page and fetch the results.
            parameters = api.parameters_fix(title=key, records=num_collect,
                                            abstract=key, start=start)
            url = api.create_url_search(parameters)
            request = api.make_request(url)
            root = api.get_root(request)
            raw_articles = api.parse(root)
            try:
                for art in raw_articles:
                    article = api.to_dataframe(art)
                    dfs.append(article)
            except Exception:
                # Parsing fails once the API returns no more articles,
                # so stop paginating for this keyword.
                switch = False
            start += num_collect

df = pd.concat(dfs, ignore_index=True)
df.to_csv('../software_data.csv')
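
# A follow-up sketch (not part of the original script): the same article can
# match both keywords, so the combined dataframe may contain duplicates.
# Assuming the arcas output includes a 'title' column (an assumption, and the
# output path below is illustrative), duplicates can be dropped like this:
if 'title' in df.columns:
    deduped = df.drop_duplicates(subset='title')
    deduped.to_csv('../software_data_deduped.csv')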