-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathebi_scraper.rb
84 lines (65 loc) · 2.73 KB
/
ebi_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env ruby
require 'open-uri'
require 'nokogiri'
require 'tess_uploader'
# Script-wide settings and state, shared between the methods and the
# top-level code below through globals.
$debug = false # defined here but not read anywhere else in this script
$lessons = {} # scraped courses, keyed by each course link's href
$owner_org = 'european-bioinformatics-institute-ebi' # owner of every uploaded course
$root_url = 'http://www.ebi.ac.uk' # prefix joined to every relative path
def parse_data(page)
doc = Nokogiri::HTML(open($root_url + page))
#first = doc.css('div.item-list').search('li')
first = doc.css('li.views-row')
first.each do |f|
titles = f.css('div.views-field-title').css('span.field-content').search('a')
desc = f.css('div.views-field-field-course-desc-value').css('div.field-content').search('p')
topics = f.css('div.views-field-tid').css('span.field-content').search('a')
#puts "TITLES: #{titles.css('a')[0]['href']}, #{titles.text}"
#puts "DESC: #{desc.text}"
puts "TOPICS: #{topics.collect{|t| t.text }}"
href = titles.css('a')[0]['href']
$lessons[href] = {}
$lessons[href]['description'] = desc.text.strip
$lessons[href]['text'] = titles.css('a')[0].text
topic_text = topics.collect{|t| t.text }
if !topic_text.empty?
$lessons[href]['topics'] = topic_text.map{|t| {'name' => t.gsub(/[^0-9a-z ]/i, ' ')} } # Replaces extract_keywords
end # Non-alphanumeric purged
end
end
# Highest page index the scrape loop will request.
# TODO: detect the actual final page instead of hard-coding it.
def last_page_number
  2
end
# Scrape the un-paged course list first, then every numbered page from 1 up
# to last_page_number.
first_page = '/training/online/course-list'
parse_data(first_page)
(1..last_page_number).each do |page_number|
  puts "Scraping page: #{page_number}"
  parse_data("#{first_page}?page=#{page_number}")
end
# Describe EBI as an organisation and hand it to the uploader before any
# course is uploaded.
org_title     = 'European Bioinformatics Institute (EBI)'
org_name      = 'european-bioinformatics-institute-ebi'
org_desc      = 'EMBL-EBI provides freely available data from life science experiments, ' \
                'performs basic research in computational biology and offers an extensive ' \
                'user training programme, supporting researchers in academia and industry.'
org_image_url = 'http://www.ebi.ac.uk/miriam/static/main/img/EBI_logo.png'
homepage      = $root_url
node_id       = 'embl-ebi'

organisation = Organisation.new(org_title, org_name, org_desc, org_image_url, homepage, node_id)
Uploader.check_create_organisation(organisation)
# Turn every scraped lesson into a tutorial record and upload it.
$lessons.each do |path, lesson|
  course = Tuition::Tutorial.new
  course.url = $root_url + path
  course.owner_org = $owner_org
  course.title = lesson['text']
  course.notes = lesson['description']
  course.set_name($owner_org, lesson['text'])
  course.tags = lesson['topics'] # nil when the row listed no topics
  course.format = 'html'

  # The resource may already exist on the server, so it is sent through
  # create_or_update rather than created blindly.
  puts "COURSE: #{course.to_json}"
  Uploader.create_or_update(course)
end