Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle and log errors when ingesting bioschemas content #936

Merged
merged 1 commit into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/controllers/bioschemas_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def run_test
if body
begin
ingestor = Ingestors::BioschemasIngestor.new
@output = ingestor.read_content(StringIO.new(body), url: params[:url] || 'https://example.com')
@output = ingestor.read_content(StringIO.new(body), url: params[:url] || Ingestors::BioschemasIngestor::DUMMY_URL).merge(messages: ingestor.messages)
rescue RDF::ReaderError
flash[:error] = 'A parsing error occurred. Please check your document contains valid JSON-LD or HTML.'
format.html { render :test, status: :unprocessable_entity }
Expand Down
20 changes: 15 additions & 5 deletions app/views/bioschemas/_test_results.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,22 @@
<% end %>
</div>
<div class="col-md-8 col-md-pull-4">
<% unless @output[:messages].blank? %>
<h5>Log</h5>
<div class="markdown source-log">
<%= render_markdown(@output[:messages].join("\n\n")) %>
</div>
<% end %>
<h4>Bioschemas summary:</h4>
<table class="table" style="max-width: 20em">
<% @output[:totals].each do |type, total| %>
<tr><td><%= type %></td><td><%= total %></td></tr>
<% end %>
</table>
<% if @output[:totals].values.sum.zero? %>
<span class="muted">Nothing found</span>
<% else %>
<table class="table" style="max-width: 20em">
<% @output[:totals].each do |type, total| %>
<tr><td><%= type %></td><td><%= total %></td></tr>
<% end %>
</table>
<% end %>
</div>
</div>

Expand Down
2 changes: 1 addition & 1 deletion app/views/sources/_test_results.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<% unless test_results[:messages].blank? %>
<h5>Log</h5>
<div class="markdown source-log">
<%= render_markdown(test_results[:messages].join("\n")) %>
<%= render_markdown(test_results[:messages].join("\n\n")) %>
</div>
<% end %>

Expand Down
116 changes: 70 additions & 46 deletions lib/ingestors/bioschemas_ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

module Ingestors
class BioschemasIngestor < Ingestor
DUMMY_URL = 'https://example.com'

attr_reader :verbose

def self.config
Expand All @@ -17,13 +19,14 @@ def read(source_url)
sitemap_regex = nil
@verbose = false
sources = if source_url.downcase.match?(/sitemap(.*)?.xml\Z/)
@messages << "\nParsing sitemap: #{source_url}\n"
sitemap_message = "Parsing sitemap: #{source_url}\n"
urls = SitemapParser.new(source_url, {
recurse: true,
url_regex: sitemap_regex,
headers: { 'User-Agent' => config[:user_agent] }
}).to_a.uniq.map(&:strip)
@messages << " - #{urls.count} URLs found"
recurse: true,
url_regex: sitemap_regex,
headers: { 'User-Agent' => config[:user_agent] }
}).to_a.uniq.map(&:strip)
sitemap_message << "\n - #{urls.count} URLs found"
@messages << sitemap_message
urls
else
[source_url]
Expand All @@ -35,19 +38,21 @@ def read(source_url)
sources.each do |url|
source = open_url(url)
output = read_content(source, url: url)
provider_events += output[:resources][:events]
provider_materials += output[:resources][:materials]
output[:totals].each do |key, value|
totals[key] += value
if output
provider_events += output[:resources][:events]
provider_materials += output[:resources][:materials]
output[:totals].each do |key, value|
totals[key] += value
end
end
end

if totals.keys.any?
@messages << "\nBioschemas summary:\n"
bioschemas_summary = "Bioschemas summary:\n"
totals.each do |type, count|
@messages << " - #{type}: #{count}"
bioschemas_summary << "\n - #{type}: #{count}"
end

@messages << bioschemas_summary
end

deduplicate(provider_events).each do |event_params|
Expand All @@ -65,46 +70,65 @@ def read_content(content, url: nil)
events: [],
materials: []
},
totals: Hash.new(0)
totals: Hash.new(0)
}

return output unless content

sample = content.read(256)&.strip
return output unless sample
begin
sample = content.read(256)&.strip
return output unless sample

format = sample.start_with?('[') || sample.start_with?('{') ? :jsonld : :rdfa
content.rewind
source = content.read
events = Tess::Rdf::EventExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
courses = Tess::Rdf::CourseExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
course_instances = Tess::Rdf::CourseInstanceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
learning_resources = Tess::Rdf::LearningResourceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
output[:totals]['Events'] += events.count
output[:totals]['Courses'] += courses.count
output[:totals]['CourseInstances'] += course_instances.count
output[:totals]['LearningResources'] += learning_resources.count
if verbose
puts "Events: #{events.count}"
puts "Courses: #{courses.count}"
puts "CourseInstances: #{course_instances.count}"
puts "LearningResources: #{learning_resources.count}"
end
format = sample.start_with?('[') || sample.start_with?('{') ? :jsonld : :rdfa
content.rewind
source = content.read
events = Tess::Rdf::EventExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
courses = Tess::Rdf::CourseExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
course_instances = Tess::Rdf::CourseInstanceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
learning_resources = Tess::Rdf::LearningResourceExtractor.new(source, format, base_uri: url).extract do |p|
convert_params(p)
end
output[:totals]['Events'] += events.count
output[:totals]['Courses'] += courses.count
output[:totals]['CourseInstances'] += course_instances.count
output[:totals]['LearningResources'] += learning_resources.count
if verbose
puts "Events: #{events.count}"
puts "Courses: #{courses.count}"
puts "CourseInstances: #{course_instances.count}"
puts "LearningResources: #{learning_resources.count}"
end

deduplicate(events + courses + course_instances).each do |event|
output[:resources][:events] << event
end
deduplicate(events + courses + course_instances).each do |event|
output[:resources][:events] << event
end

deduplicate(learning_resources).each do |material|
output[:resources][:materials] << material
deduplicate(learning_resources).each do |material|
output[:resources][:materials] << material
end
rescue StandardError => e
Rails.logger.error("#{e.class}: #{e.message}")
Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
error = 'An error'
comment = nil
if e.is_a?(RDF::ReaderError)
error = 'A parsing error'
comment = 'Please check your page contains valid JSON-LD or HTML.'
end
message = "#{error} occurred while reading"
if url.present? && url != 'https://example.com'
message << ": #{url} "
else
message << " the source"
end
message << ". #{comment}" if comment
@messages << message
end

output
Expand Down
10 changes: 6 additions & 4 deletions test/controllers/bioschemas_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ class BioschemasControllerTest < ActionController::TestCase

post :run_test, params: { snippet: "{ 'oh dear }" }

assert_response :unprocessable_entity
assert flash[:error].include?('parsing error')
assert_response :success
assert_select '.source-log', text:
'A parsing error occurred while reading the source. Please check your page contains valid JSON-LD or HTML.'
ensure
JSON::LD::Reader.define_method(old_method.name, old_method)
end
Expand All @@ -111,8 +112,9 @@ class BioschemasControllerTest < ActionController::TestCase

post :run_test, params: { url: 'https://website.com/material.json' }

assert_response :unprocessable_entity
assert flash[:error].include?('parsing error')
assert_response :success
assert_select '.source-log', text:
'A parsing error occurred while reading: https://website.com/material.json . Please check your page contains valid JSON-LD or HTML.'
ensure
JSON::LD::Reader.define_method(old_method.name, old_method)
end
Expand Down
7 changes: 4 additions & 3 deletions test/unit/ingestors/bioschemas_ingestor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ class BioschemasIngestorTest < ActiveSupport::TestCase
@ingestor.read('https://training.galaxyproject.org/sitemap.xml')
assert_equal 0, @ingestor.events.count
assert_equal 3, @ingestor.materials.count
assert_includes @ingestor.messages, " - 6 URLs found"
assert_includes @ingestor.messages, " - Events: 0"
assert_includes @ingestor.messages, " - LearningResources: 3"
messages = @ingestor.messages.join("\n")
assert_includes messages, "\n - 6 URLs found"
assert_includes messages, "\n - Events: 0"
assert_includes messages, "\n - LearningResources: 3"

assert_difference('Material.count', 3) do
@ingestor.write(@user, @content_provider)
Expand Down
Loading