Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
--require spec_helper
--pattern 'spec/**/*.rb'
10 changes: 10 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# frozen_string_literal: true

source "https://rubygems.org"

ruby "~> 3.4"

# gem "rails"

gem "rspec"
gem "nokogiri", "~> 1.19"
54 changes: 54 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
nokogiri (1.19.3-aarch64-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.3-aarch64-linux-musl)
racc (~> 1.4)
nokogiri (1.19.3-arm-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.3-arm-linux-musl)
racc (~> 1.4)
nokogiri (1.19.3-arm64-darwin)
racc (~> 1.4)
nokogiri (1.19.3-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.19.3-x86_64-linux-gnu)
racc (~> 1.4)
nokogiri (1.19.3-x86_64-linux-musl)
racc (~> 1.4)
racc (1.8.1)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)

PLATFORMS
aarch64-linux-gnu
aarch64-linux-musl
arm-linux-gnu
arm-linux-musl
arm64-darwin
x86_64-darwin
x86_64-linux-gnu
x86_64-linux-musl

DEPENDENCIES
nokogiri (~> 1.19)
rspec

RUBY VERSION
ruby 3.4.9p82

BUNDLED WITH
2.6.9
55 changes: 55 additions & 0 deletions files/claude-monet-paintings.html

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions files/michelangelo-sculptures.html

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions files/pablo-picasso.html

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions lib/page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
require "nokogiri"
require "json"

# Raised when the page no longer has the structure the scraper expects.
# Carries no behaviour of its own; it exists purely so callers can rescue
# scraper failures separately from other runtime errors.
class ScraperError < RuntimeError; end

# Parses the HTML of a Google search results page and extracts the
# visual-artist "works" carousel from it.
#
#   Page.new(html).scrape
#   # => { "artworks" => [ { "name" => ..., "extensions" => [...], "link" => ..., "image" => ... }, ... ] }
class Page
  # Origin prepended to the root-relative hrefs used by carousel links.
  GOOGLE_ORIGIN = "https://www.google.com".freeze

  # Matches the inline script that assigns one base64 image to one or more <img> ids:
  #   var s='data:image/jpeg;base64,...';var ii=['id_1','id_2'];
  # Capture 1 is the data URI, capture 2 is the raw quoted id list.
  # This will break if Google swaps the variable order.
  LAZY_IMAGE_PATTERN = %r{var s='(data:[a-z]+/[a-z]+;base64,[^']+)';var ii=\[('[^']+'(?:\s*,\s*'[^']+')*)\]}

  # @param html [String] raw HTML of the search results page
  def initialize(html)
    @html = html
    @doc = Nokogiri::HTML(@html)
    @image_map = get_lazy_load_map
  end

  # public entrypoint; extend here for additional block types
  #
  # @return [Hash] { "artworks" => Array<Hash> }; the array is empty when the
  #   page has no matching carousel
  # @raise [ScraperError] when a carousel item is missing its image, name or link
  def scrape
    scrape_carousel
  end

  # search for images that are lazily loaded
  # these images are stored in <script> tags
  # however: not all images are lazily loaded
  #
  # A single script may assign the same payload to several ids, so every id in
  # the `ii` list is mapped. When an id appears more than once, the last
  # occurrence in the document wins.
  #
  # @return [Hash{String => String}] img id => data URI (still `\x3d`-escaped)
  private def get_lazy_load_map
    @html.scan(LAZY_IMAGE_PATTERN).each_with_object({}) do |(image, id_list), map|
      id_list.scan(/'([^']+)'/).flatten.each { |id| map[id] = image }
    end
  end

  # @return [Hash] { "artworks" => Array<Hash> }
  private def scrape_carousel
    # couldn't find any other carousel type -
    # everything else (albums, films, books) use grids instead of carousels
    # grids may look functionally identical, but they are semantically different
    # + the return key in expected-array.json is { "artworks": [] }
    # that's why this selector is strict
    carousel = @doc.css("[data-attrid=\"kc:/visual_art/visual_artist:works\"]")

    # the <a> parent doesn't have any class/id
    # img is more stable, in that case
    items = carousel.css("img").map { |img| scrape_artwork(img) }

    { "artworks" => items }
  end

  # Builds the result hash for one carousel item, starting from its <img> node.
  #
  # @raise [ScraperError] when the image, name or link cannot be found
  private def scrape_artwork(img)
    image = resolve_image(img)

    # a > div > (name_div, year_div)
    name_div, year_div = img.parent.css("div > div")
    raise ScraperError, "missing work details" if name_div.nil?

    # fall back to the alt text when the caption div is present but empty
    name = name_div.text.empty? ? img[:alt] : name_div.text
    raise ScraperError, "missing artwork name" if name.nil? || name.empty?
    year = year_div&.text || ""

    # this will break if the image parent tag changes
    # but it works on the example from 2 years ago,
    # and it works on the current serp
    link_el = img.ancestors("a").first
    raise ScraperError, "missing link element" if link_el.nil? || link_el[:href].to_s.empty?

    {
      "name" => name,
      # nil here is removed by #compact, so items without a year have no "extensions" key
      "extensions" => year.empty? ? nil : [year],
      "link" => absolute_link(link_el[:href]),
      # script tags contain `=` base64 padding as `\x3d` instead - unescape only that
      "image" => image.gsub('\x3d', '='),
    }.compact
  end

  # Picks the image source for an <img> node.
  # precedence: lazy load > data-src > src
  #
  # @raise [ScraperError] when no source exists or only the gif placeholder is left
  private def resolve_image(img)
    image = @image_map[img[:id]] || img["data-src"] || img["src"]

    raise ScraperError, 'missing image data - structure changed?' if image.nil?
    raise ScraperError, 'placeholder gif detected - structure changed?' if image.start_with?("data:image/gif;base64,")

    image
  end

  # Turns a carousel href into an absolute URL. Hrefs that already carry an
  # http(s) scheme are returned untouched instead of being prefixed a second
  # time; everything else gets the Google origin prepended.
  private def absolute_link(href)
    return href if href.match?(%r{\Ahttps?://}i)

    GOOGLE_ORIGIN + href
  end
end

# Command-line entry point: only runs when this file is executed directly.
# Reads the HTML file named by the first argument, scrapes it and prints the
# result as pretty-printed JSON. Without an argument it prints usage and
# exits with status 1.
if __FILE__ == $PROGRAM_NAME
  input_path = ARGV.first

  unless input_path
    puts "USAGE: #{$PROGRAM_NAME} <serp.html>"
    exit 1
  end

  scraped = Page.new(File.read(input_path)).scrape
  puts JSON.pretty_generate(scraped)
end
83 changes: 83 additions & 0 deletions spec/page_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
FILES_DIR = File.join(__dir__, "../files")

# Scrapes a fixture page.
#
# @param filename [String] name of an HTML file inside FILES_DIR
# @return [Hash] whatever Page#scrape returns for that file
def scrape(filename)
  fixture_path = File.join(FILES_DIR, filename)
  html = File.read(fixture_path)
  Page.new(html).scrape
end

# Assertions shared by every fixture that should yield an artwork carousel.
# The including group is responsible for assigning the scraper output to
# @results before these examples run.
shared_examples "artwork carousel" do
  it "contains artworks" do
    expect(@results).to include("artworks")
    expect(@results["artworks"]).not_to be_empty
  end

  it "has valid artworks" do
    @results["artworks"].each do |artwork|
      expect(artwork).to include("name", "link", "image")
      expect(artwork["name"]).not_to be_empty
    end
  end

  it "has well-formed links" do
    @results["artworks"].each do |artwork|
      # /search? is Google's, not scraper-guaranteed - asserted to catch a structure change
      expect(artwork["link"]).to start_with("https://www.google.com/search?")
    end
  end

  it "has valid images" do
    @results["artworks"].each do |artwork|
      image = artwork["image"]
      expect(image).not_to be_empty
      # a gif is most likely the placeholder image, which must never be returned
      expect(image).not_to start_with("data:image/gif")

      # anything that is not an inline data URI has to be a remote https URL
      unless image.start_with?("data:")
        expect(image).to start_with("https://")
        next
      end

      # base64 must survive the \x3d unescape; a stray escape leaves a backslash
      payload = image.split("base64,", 2).fetch(1)
      expect(payload).to match(%r{\A[A-Za-z0-9+/]+=*\z})
    end
  end

  it "has plausible extensions when present" do
    @results["artworks"].each do |artwork|
      next unless artwork.key?("extensions")

      # unanchored: a year may appear in a range ("1508-1512") or be approximate ("c. 1889")
      expect(artwork["extensions"].first).to match(/\d{3,4}/)
    end
  end
end

describe "carousel scraper" do
  # The only fixture with a known-good expected output to compare against.
  describe "van gogh paintings" do
    # before(:all) is fine here: the instance variables are only read, never mutated
    before(:all) do
      @results = scrape("van-gogh-paintings.html")
      expected_path = File.join(FILES_DIR, "expected-array.json")
      @expected = JSON.parse(File.read(expected_path))
    end

    include_examples "artwork carousel"

    it "matches expected json" do
      expect(@results).to eq(@expected)
    end
  end

  # Remaining fixtures only get the structural checks (group label => fixture file).
  {
    "michelangelo sculptures" => "michelangelo-sculptures.html",
    "picasso artwork" => "pablo-picasso.html",
    "claude monet paintings" => "claude-monet-paintings.html",
  }.each do |label, file|
    describe label do
      before(:all) do
        @results = scrape(file)
      end

      include_examples "artwork carousel"
    end
  end

  describe "empty page" do
    it "returns empty artworks gracefully" do
      blank_page = Page.new("<html></html>")
      expect(blank_page.scrape).to eq({ "artworks" => [] })
    end
  end
end
101 changes: 101 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# This file was generated by the `rspec --init` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# The generated `.rspec` file contains `--require spec_helper` which will cause
# this file to always be loaded, without a need to explicitly require it in any
# files.
#
# Given that it is always loaded, you are encouraged to keep this file as
# light-weight as possible. Requiring heavyweight dependencies from this file
# will add to the boot time of your test suite on EVERY test run, even for an
# individual file that may not need all of that loaded. Instead, consider making
# a separate helper file that requires the additional dependencies and performs
# the additional setup, and require it from the spec files that actually need
# it.
#
# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
require 'json'
require_relative '../lib/page'

RSpec.configure do |config|
  # rspec-expectations settings. A different assertion library (wrong,
  # stdlib/minitest assertions, ...) could be plugged in here instead.
  config.expect_with :rspec do |expectation_config|
    # Becomes the default in RSpec 4. Makes the `description` and
    # `failure_message` of custom matchers include the helper methods
    # defined with `chain`, e.g.
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # instead of
    #     # => "be bigger than 2"
    expectation_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. Another test double library (bogus, mocha, ...)
  # could be selected by changing the `mock_with` argument.
  config.mock_with :rspec do |mock_config|
    # Refuses to mock or stub a method that does not exist on the real
    # object. Generally recommended; becomes the default in RSpec 4.
    mock_config.verify_partial_doubles = true
  end

  # Will default to `:apply_to_host_groups` in RSpec 4 (with no way to turn it
  # off -- the option only exists for backwards compatibility in RSpec 3).
  # Shared context metadata is inherited by the metadata hash of host groups
  # and examples, rather than triggering implicit auto-inclusion in groups
  # with matching metadata.
  config.shared_context_metadata_behavior = :apply_to_host_groups

# Everything between =begin and =end is disabled. These are the settings
# suggested by `rspec --init` for a good initial experience; customize freely.
=begin
  # This allows you to limit a spec run to individual examples or groups
  # you care about by tagging them with `:focus` metadata. When nothing
  # is tagged with `:focus`, all examples get run. RSpec also provides
  # aliases for `it`, `describe`, and `context` that include `:focus`
  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
  config.filter_run_when_matching :focus

  # Allows RSpec to persist some state between runs in order to support
  # the `--only-failures` and `--next-failure` CLI options. We recommend
  # you configure your source control system to ignore this file.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Limits the available syntax to the non-monkey patched syntax that is
  # recommended. For more details, see:
  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = "doc"
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end