serpapi · 6in4 · May 15, 2026 · May 16, 2026 · May 16, 2026 · May 19, 2026
diff --git a/.rspec b/.rspec
@@ -0,0 +1,2 @@
+--require spec_helper
+--pattern 'spec/**/*.rb'
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+ruby "~> 3.4"
+
+# rspec runs the specs under spec/; nokogiri is the HTML parser required by lib/page.rb
+
+gem "rspec"
+gem "nokogiri", "~> 1.19"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,54 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    diff-lcs (1.6.2)
+    nokogiri (1.19.3-aarch64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.3-aarch64-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.3-arm-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.3-arm-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.3-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.3-x86_64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.3-x86_64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.3-x86_64-linux-musl)
+      racc (~> 1.4)
+    racc (1.8.1)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.8)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+
+PLATFORMS
+  aarch64-linux-gnu
+  aarch64-linux-musl
+  arm-linux-gnu
+  arm-linux-musl
+  arm64-darwin
+  x86_64-darwin
+  x86_64-linux-gnu
+  x86_64-linux-musl
+
+DEPENDENCIES
+  nokogiri (~> 1.19)
+  rspec
+
+RUBY VERSION
+   ruby 3.4.9p82
+
+BUNDLED WITH
+   2.6.9
diff --git a/files/claude-monet-paintings.html b/files/claude-monet-paintings.html
diff --git a/files/michelangelo-sculptures.html b/files/michelangelo-sculptures.html
diff --git a/files/pablo-picasso.html b/files/pablo-picasso.html
diff --git a/lib/page.rb b/lib/page.rb
@@ -0,0 +1,87 @@
+require "nokogiri"
+require "json"
+
+class ScraperError < RuntimeError
+  # raised by Page when the carousel markup lacks an expected piece; intentionally empty - it only exists so callers can rescue scraper failures specifically
+end
+
+class Page
+  def initialize(html)
+    @html = html
+    @doc = Nokogiri::HTML(@html)
+    @image_map = get_lazy_load_map
+  end
+
+  # public entrypoint; extend here for additional block types
+  def scrape
+    scrape_carousel
+  end
+
+  # search for images that are lazily loaded
+  # these images are stored in <script> tags
+  # however: not all images are lazily loaded
+  private def get_lazy_load_map
+    # this will break if Google swaps the variable order
+    results = @html.scan(/var s='(data:[a-z]+\/[a-z]+;base64,[^']+)';var ii=\['([^']+)'\]/)
+    results.to_h { | image, id | [ id, image ] }
+  end
+
+  private def scrape_carousel
+    # couldn't find any other carousel type -
+    #   everything else (albums, films, books) use grids instead of carousels
+    #   grids may look functionally identical, but they are semantically different
+    #   + the return key in expected-array.json is { "artworks": [] }
+    # that's why this selector is strict
+    carousel = @doc.css("[data-attrid=\"kc:/visual_art/visual_artist:works\"]")
+
+    # the <a> parent doesn't have any class/id
+    # img is more stable, in that case
+    items = carousel.css("img").map do | img |
+      # precedence: lazy load > data-src > src
+      image = (
+        @image_map[img[:id]] ||
+        img['data-src'] ||
+        img['src']
+      )
+
+      raise ScraperError, 'missing image data - structure changed?' if image.nil?
+      raise ScraperError, 'placeholder gif detected - structure changed?' if image.start_with?("data:image/gif;base64,")
+
+      # a > div > (name_div, year_div)
+      name_div, year_div = img.parent.css("div > div")
+      raise ScraperError, "missing work details" if name_div.nil?
+
+      name = name_div.text.empty? ? img[:alt] : name_div.text
+      raise ScraperError, "missing artwork name" if name.nil? || name.empty?
+      year = year_div&.text || ""
+
+      # this will break if the image parent tag changes
+      # but it works on the example from 2 years ago, 
+      # and it works on the current serp
+      link_el = img.ancestors("a").first
+      raise ScraperError, "missing link element" if link_el.nil? || link_el[:href].to_s.empty?
+      link = "https://www.google.com" + link_el[:href]
+
+      {
+        "name" => name,
+        "extensions" => year.empty? ? nil : [year],
+        "link" => link,
+        # script tags contain `=` base64 padding as `\x3d` instead - unescape only that
+        "image" => image.gsub('\x3d', '='),
+      }.compact
+    end
+
+    { "artworks" => items }
+  end
+end
+
+if $0 == __FILE__ 
+  if ARGV[0].nil?
+    puts "USAGE: #{$0} <serp.html>"
+    exit 1
+  end
+
+  puts JSON.pretty_generate(
+    Page.new(File.read(ARGV[0])).scrape
+  )
+end
diff --git a/spec/page_spec.rb b/spec/page_spec.rb
@@ -0,0 +1,83 @@
+FILES_DIR = File.join(__dir__, "../files")
+
+def scrape(filename)
+  Page.new(File.read(File.join(FILES_DIR, filename))).scrape()
+end
+
+shared_examples "artwork carousel" do
+  it "contains artworks" do
+    expect(@results).to include("artworks")
+    expect(@results["artworks"]).not_to be_empty
+  end
+
+  it "has valid artworks" do
+    @results["artworks"].each do |a|
+      expect(a).to include("name", "link", "image")
+      expect(a["name"]).not_to be_empty
+    end
+  end
+
+  it "has well-formed links" do
+    @results["artworks"].each do |a|
+      # /search? is Google's, not scraper-guaranteed - asserted to catch a structure change
+      expect(a["link"]).to start_with("https://www.google.com/search?")
+    end
+  end
+
+  it "has valid images" do
+    @results["artworks"].each do |a|
+      image = a["image"]
+      expect(image).not_to be_empty
+      # if it's a gif, it's likely a placeholder image - shouldn't be getting those
+      expect(image).not_to start_with("data:image/gif")
+
+      if image.start_with?("data:")
+        # base64 must survive the \x3d unescape; a stray escape leaves a backslash
+        payload = image.split("base64,", 2).fetch(1)
+        expect(payload).to match(%r{\A[A-Za-z0-9+/]+=*\z})
+      else
+        expect(image).to start_with("https://")
+      end
+    end
+  end
+
+  it "has plausible extensions when present" do
+    @results["artworks"].select { |a| a.key?("extensions") }.each do |a|
+      # unanchored: a year may appear in a range ("1508-1512") or be approximate ("c. 1889")
+      expect(a["extensions"].first).to match(/\d{3,4}/)
+    end
+  end
+end
+
+describe "carousel scraper" do
+  describe "van gogh paintings" do
+    # before :all is ok here since we don't mutate any of these variables - they're read only
+    before :all do
+      @results = scrape("van-gogh-paintings.html")
+      @expected = JSON.parse(File.read(File.join(FILES_DIR, "expected-array.json")))
+    end
+
+    include_examples "artwork carousel"
+
+    it "matches expected json" do
+      expect(@results).to eq(@expected)
+    end
+  end
+
+  [
+    ["michelangelo sculptures", "michelangelo-sculptures.html"],
+    ["picasso artwork",         "pablo-picasso.html"],
+    ["claude monet paintings",  "claude-monet-paintings.html"],
+  ].each do |label, file|
+    describe label do
+      before(:all) { @results = scrape(file) }
+      include_examples "artwork carousel"
+    end
+  end
+
+  describe "empty page" do
+    it "returns empty artworks gracefully" do
+      expect(Page.new("<html></html>").scrape()).to eq({ "artworks" => [] })
+    end
+  end
+end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -0,0 +1,101 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# The generated `.rspec` file contains `--require spec_helper` which will cause
+# this file to always be loaded, without a need to explicitly require it in any
+# files.
+#
+# Given that it is always loaded, you are encouraged to keep this file as
+# light-weight as possible. Requiring heavyweight dependencies from this file
+# will add to the boot time of your test suite on EVERY test run, even for an
+# individual file that may not need all of that loaded. Instead, consider making
+# a separate helper file that requires the additional dependencies and performs
+# the additional setup, and require it from the spec files that actually need
+# it.
+#
+# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require 'json'
+require_relative '../lib/page'
+
+RSpec.configure do |config|
+  # rspec-expectations config goes here. You can use an alternate
+  # assertion/expectation library such as wrong or the stdlib/minitest
+  # assertions if you prefer.
+  config.expect_with :rspec do |expectations|
+    # This option will default to `true` in RSpec 4. It makes the `description`
+    # and `failure_message` of custom matchers include text for helper methods
+    # defined using `chain`, e.g.:
+    #     be_bigger_than(2).and_smaller_than(4).description
+    #     # => "be bigger than 2 and smaller than 4"
+    # ...rather than:
+    #     # => "be bigger than 2"
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+
+  # rspec-mocks config goes here. You can use an alternate test double
+  # library (such as bogus or mocha) by changing the `mock_with` option here.
+  config.mock_with :rspec do |mocks|
+    # Prevents you from mocking or stubbing a method that does not exist on
+    # a real object. This is generally recommended, and will default to
+    # `true` in RSpec 4.
+    mocks.verify_partial_doubles = true
+  end
+
+  # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
+  # have no way to turn it off -- the option exists only for backwards
+  # compatibility in RSpec 3). It causes shared context metadata to be
+  # inherited by the metadata hash of host groups and examples, rather than
+  # triggering implicit auto-inclusion in groups with matching metadata.
+  config.shared_context_metadata_behavior = :apply_to_host_groups
+
+# The settings below are suggested to provide a good initial experience
+# with RSpec, but feel free to customize to your heart's content.
+=begin
+  # This allows you to limit a spec run to individual examples or groups
+  # you care about by tagging them with `:focus` metadata. When nothing
+  # is tagged with `:focus`, all examples get run. RSpec also provides
+  # aliases for `it`, `describe`, and `context` that include `:focus`
+  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
+  config.filter_run_when_matching :focus
+
+  # Allows RSpec to persist some state between runs in order to support
+  # the `--only-failures` and `--next-failure` CLI options. We recommend
+  # you configure your source control system to ignore this file.
+  config.example_status_persistence_file_path = "spec/examples.txt"
+
+  # Limits the available syntax to the non-monkey patched syntax that is
+  # recommended. For more details, see:
+  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
+  config.disable_monkey_patching!
+
+  # This setting enables warnings. It's recommended, but in some cases may
+  # be too noisy due to issues in dependencies.
+  config.warnings = true
+
+  # Many RSpec users commonly either run the entire suite or an individual
+  # file, and it's useful to allow more verbose output when running an
+  # individual spec file.
+  if config.files_to_run.one?
+    # Use the documentation formatter for detailed output,
+    # unless a formatter has already been configured
+    # (e.g. via a command-line flag).
+    config.default_formatter = "doc"
+  end
+
+  # Print the 10 slowest examples and example groups at the
+  # end of the spec run, to help surface which specs are running
+  # particularly slow.
+  config.profile_examples = 10
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = :random
+
+  # Seed global randomization in this process using the `--seed` CLI option.
+  # Setting this allows you to use `--seed` to deterministically reproduce
+  # test failures related to randomization by passing the same `--seed` value
+  # as the one that triggered the failure.
+  Kernel.srand config.seed
+=end
+end
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		--require spec_helper
		--pattern 'spec/**/*.rb'