Skip to content

Commit

Permalink
Merge pull request #1032 from DaanVanVugt/feature/vu_materials_scraper
Browse files Browse the repository at this point in the history
vu materials scraper
  • Loading branch information
fbacall authored Oct 21, 2024
2 parents 5d14f23 + 8be2bb4 commit be352f6
Show file tree
Hide file tree
Showing 4 changed files with 11,207 additions and 1 deletion.
3 changes: 2 additions & 1 deletion lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def self.ingestors
Ingestors::RstIngestor,
Ingestors::OsciIngestor,
Ingestors::DccIngestor,
Ingestors::SenseIngestor
Ingestors::SenseIngestor,
Ingestors::VuMaterialIngestor,
] + llm_ingestors
end

Expand Down
71 changes: 71 additions & 0 deletions lib/ingestors/vu_material_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
require 'open-uri'
require 'csv'
require 'nokogiri'

module Ingestors
  # Ingestor that queries the vu.nl search API (https://vu.nl/api/search) and
  # registers one material per search hit.
  class VuMaterialIngestor < Ingestor
    # Endpoint that receives the search query as a JSON POST body.
    SEARCH_ENDPOINT = 'https://vu.nl/api/search'

    # Prefix prepended to the 'Url' field of every search hit.
    SITE_ROOT = 'https://vu.nl'

    # Headers sent with the search request. Host, Content-Length and
    # Content-Type are deliberately absent: Net::HTTP derives the first two
    # from the connection and the body, and the content type is set on the
    # request itself. A hand-written Content-Length that does not match the
    # body is at best ignored and at worst wrong.
    REQUEST_HEADERS = {
      'Accept' => 'application/json',
      'api-version' => '2020-06-30',
      'Origin' => 'https://vu.nl',
      'Referer' => 'https://vu.nl/en/education/phd-courses'
    }.freeze

    # Search payload, serialised to JSON before sending.
    SEARCH_QUERY = {
      filter: "ItemType/any(c: search.in(c, 'Study', '|')) and ItemType/any(c: search.in(c, 'PhD', '|')) and Language eq 'EN'",
      search: '*',
      skip: 0,
      top: 1000
    }.freeze

    # @return [Hash] the key, title and category describing this ingestor
    def self.config
      {
        key: 'vu_material',
        title: 'VU Materials API',
        category: :materials
      }
    end

    # Runs the ingestion. A failure is appended to @messages instead of being
    # raised to the caller.
    #
    # Only StandardError is rescued, so Interrupt, SystemExit and similar
    # process-level exceptions still propagate.
    #
    # @param url [String] source URL, forwarded to #process_vu
    # @return [nil]
    def read(url)
      process_vu(url)
      nil
    rescue StandardError => e
      @messages << "#{self.class.name} failed with: #{e.message}"
      nil
    end

    private

    # Fetches the search hits and passes each one to add_material. A hit that
    # cannot be converted is reported in @messages and skipped, so one bad
    # record does not abort the remaining ones.
    #
    # @param url [String] currently unused; the request always goes to
    #   SEARCH_ENDPOINT
    def process_vu(url)
      fetch_search_hits.each do |hit|
        add_material(build_material(hit))
      rescue StandardError => e
        @messages << "Extract material fields failed with: #{e.message}"
      end
    end

    # POSTs SEARCH_QUERY to SEARCH_ENDPOINT and returns the parsed hits.
    #
    # @return [Array] the 'value' entry of the JSON response
    # @raise [RuntimeError] when the response status is not 2xx
    # @raise [JSON::ParserError] when the body is not valid JSON
    # @raise [KeyError] when the response has no 'value' key
    def fetch_search_hits
      endpoint = URI.parse(SEARCH_ENDPOINT)
      http = Net::HTTP.new(endpoint.host, endpoint.port)
      http.use_ssl = (endpoint.scheme == 'https')

      response = http.request(build_search_request(endpoint))
      unless response.is_a?(Net::HTTPSuccess)
        raise "VU search API returned HTTP #{response.code}"
      end

      JSON.parse(response.body).fetch('value')
    end

    # Builds the JSON POST request for the search endpoint.
    #
    # @param endpoint [URI] parsed SEARCH_ENDPOINT; passing the URI object
    #   (rather than the full URL as a path string) makes Net::HTTP send the
    #   request path only
    # @return [Net::HTTP::Post]
    def build_search_request(endpoint)
      request = Net::HTTP::Post.new(endpoint)
      REQUEST_HEADERS.each { |name, value| request[name] = value }
      request.content_type = 'application/json'
      request.body = SEARCH_QUERY.to_json
      request
    end

    # Maps one search hit onto the attributes handed to add_material.
    #
    # @param hit [Hash] one element of the API's 'value' array
    # @return [OpenStruct] with title, url and description set
    def build_material(hit)
      material = OpenStruct.new
      material.title = hit['Title']
      material.url = "#{SITE_ROOT}#{hit['Url']}"
      material.description = hit['IntroText']
      material
    end
  end
end
48 changes: 48 additions & 0 deletions test/unit/ingestors/vu_material_ingestor_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
require 'test_helper'

# Exercises Ingestors::VuMaterialIngestor against the 'ingestors/vu_material'
# VCR cassette and checks the Material records it writes.
class VuMaterialIngestorTest < ActiveSupport::TestCase
  setup do
    @user = users(:regular_user)
    @content_provider = content_providers(:portal_provider)
    mock_ingestions
    # The system time zone must not influence the result.
    mock_timezone
  end

  teardown { reset_timezone }

  test 'can ingest materials from vu' do
    expected_title = 'Writing and presenting'
    expected_url = 'https://vu.nl/en/education/phd-courses/writing-and-presenting'
    matching_materials = -> { Material.where(title: expected_title, url: expected_url) }

    vu_source = @content_provider.sources.build(
      url: 'https://vu.nl/en/education/phd-courses',
      method: 'vu_material',
      enabled: true
    )
    vu_ingestor = Ingestors::VuMaterialIngestor.new

    # Precondition: the material is not in the database yet.
    refute matching_materials.call.any?

    # Reading and writing inside the cassette must create every material.
    assert_difference('Material.count', 169) do
      freeze_time(2019) do
        VCR.use_cassette('ingestors/vu_material') do
          vu_ingestor.read(vu_source.url)
          vu_ingestor.write(@user, @content_provider)
        end
      end
    end

    # The material now exists with the expected title and URL.
    ingested = matching_materials.call.first
    assert ingested
    assert_equal expected_title, ingested.title
    assert_equal expected_url, ingested.url

    # The remaining fields were filled in as well.
    assert_equal 'In this course students will be trained in two important academic skills: writing, and presenting.', ingested.description
  end
end
Loading

0 comments on commit be352f6

Please sign in to comment.