serpapi · Willi8910 · Mar 19, 2025 · Mar 19, 2025 · Mar 19, 2025
diff --git a/.rspec b/.rspec
@@ -0,0 +1 @@
+--require spec_helper
diff --git a/google_scraper.rb b/google_scraper.rb
@@ -0,0 +1,142 @@
+require 'fileutils'
+require 'json'
+require 'optparse'
+require 'securerandom'
+require 'uri'
+
+require 'byebug'
+require 'selenium-webdriver'
+
+class GoogleScraper
+  GOOGLE_HOST = 'https://www.google.com'
+
+  attr_reader :query, :driver, :directory_name
+
+  def initialize(query)
+    @query = query
+    @directory_name = "results/#{query.gsub(/\W/, '')}_results"
+
+    # Setup Selenium WebDriver (Chrome)
+    # setup in way to avoid detection by google
+    options = Selenium::WebDriver::Chrome::Options.new
+    options.add_argument('--disable-blink-features=AutomationControlled')
+    options.add_argument('--disable-infobars')
+    options.add_argument('--start-maximized')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+
+    # Set a real User-Agent
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+    options.add_argument("--user-agent=#{user_agent}")
+    # options.add_argument('--headless') # Run Chrome in headless mode
+    @driver = Selenium::WebDriver.for :chrome, options: options
+  end
+
+  def perform
+    begin
+      # Open the target webpage
+      driver.navigate.to "#{GOOGLE_HOST}/search?q=#{query}"
+      random_sleep
+
+      # Wait until the page is fully loaded
+      wait = Selenium::WebDriver::Wait.new(timeout: 10) # Adjust timeout as needed
+      wait.until { driver.execute_script('return document.readyState') == 'complete' }
+
+      # Take a screenshot and save it
+      FileUtils.mkdir_p directory_name
+      screenshot_path = "#{directory_name}/screenshot.png"
+      driver.save_screenshot(screenshot_path)
+
+      page_html = driver.page_source
+      File.open("#{directory_name}/page_source.html", 'w') { |file| file.write(page_html) }
+
+      result = {}
+      result.merge!(scrape_artwork)
+      result.merge!(scrape_images)
+      json_result = JSON.pretty_generate(result)
+      File.open("#{directory_name}/expected_array.json", "w") do |file|
+        file.write(json_result)
+      end
+    ensure
+      driver.quit # Ensure the browser is closed
+    end
+  end
+
+  private
+
+    def scrape_artwork
+      artwork_box_classname = 'Cz5hV'
+      artwork_boxes = driver.find_elements(:class, artwork_box_classname)
+      return {} unless artwork_boxes.any?
+
+      artwork_box = artwork_boxes.first
+      items = artwork_box.find_elements(:tag_name, 'a')
+      item_result = items.map do |item|
+        link = item.attribute('href')
+
+        image_element = item.find_element(:tag_name, 'img')
+        image = image_element.attribute('src')
+
+        name_class_name = 'pgNMRc'
+        name = item.find_element(:class, name_class_name).text
+
+        extentions_class_name = 'cxzHyb'
+        extentions = item.find_element(:class, extentions_class_name).text
+        extentions = extentions.split(' · ')
+        {
+          name: name,
+          extentions: extentions,
+          link: link,
+          image: image
+        }
+      end
+
+      {
+        artwork: item_result
+      }
+    end
+
+    def scrape_images
+      image_box_classname = 'iur'
+      image_boxes = driver.find_elements(:id, image_box_classname)
+      return {} unless image_boxes.any?
+
+      driver.find_element(:class_name, 'jEgXc').click
+
+      image_box = image_boxes.first
+      items = image_box.find_elements(:class_name, 'w43QB')
+      item_result = items.map do |item|
+        link_element = item.find_element(:tag_name, 'a')
+        link = link_element.attribute('href')
+
+        image_element = item.find_element(:class_name, 'gdOPf').find_element(:tag_name, 'img')
+        image = image_element.attribute('src')
+
+        source_box = item.find_element(:class_name, 'VaiWld')
+
+        title_class_name = 'Yt787'
+        title = item.find_element(:class_name, title_class_name).text
+
+        source_icon_element = source_box.find_element(:tag_name, 'img')
+        source_icon = source_icon_element.attribute('src')
+
+        source_site_class_name = 'R8BTeb'
+        source_site = item.find_element(:class_name, source_site_class_name).text
+
+        {
+          title: title,
+          source_icon: source_icon,
+          source_site: source_site,
+          link: link,
+          image: image
+        }
+      end
+
+      {
+        image: item_result
+      }
+    end
+
+    def random_sleep
+      sleep rand(1..2)
+    end
+end
diff --git a/results/CuteCatPhotos_results/expected_array.json b/results/CuteCatPhotos_results/expected_array.json
diff --git a/results/CuteCatPhotos_results/page_source.html b/results/CuteCatPhotos_results/page_source.html
diff --git a/results/CuteCatPhotos_results/screenshot.png b/results/CuteCatPhotos_results/screenshot.png
diff --git a/results/LeonardoDaVinciPaintings_results/expected_array.json b/results/LeonardoDaVinciPaintings_results/expected_array.json
diff --git a/results/LeonardoDaVinciPaintings_results/page_source.html b/results/LeonardoDaVinciPaintings_results/page_source.html
diff --git a/results/LeonardoDaVinciPaintings_results/screenshot.png b/results/LeonardoDaVinciPaintings_results/screenshot.png
diff --git a/results/VanGoghpainting_results/expected_array.json b/results/VanGoghpainting_results/expected_array.json
diff --git a/results/VanGoghpainting_results/page_source.html b/results/VanGoghpainting_results/page_source.html
diff --git a/results/VanGoghpainting_results/screenshot.png b/results/VanGoghpainting_results/screenshot.png
diff --git a/spec/google_scraper_spec.rb b/spec/google_scraper_spec.rb
@@ -0,0 +1,31 @@
+require 'rspec'
+require 'json'
+require_relative '../google_scraper.rb'
+
+describe 'Google Image Scraper' do
+  let(:query) { 'Johannes Vermeer artwork'}
+
+  describe '#scrape' do
+    it 'returns correct result' do
+      GoogleScraper.new(query).perform
+      expected_directory = './results/JohannesVermeerartwork_results'
+
+      # check if file generated
+      expect(Dir.exist?(expected_directory)).to be_truthy
+      expect(File.exist?("#{expected_directory}/expected_array.json")).to be_truthy
+      expect(File.exist?("#{expected_directory}/page_source.html")).to be_truthy
+      expect(File.exist?("#{expected_directory}/screenshot.png")).to be_truthy
+
+      # test first item
+      file_content = File.read("#{expected_directory}/expected_array.json")
+      data = JSON.parse(file_content)
+      item = data['artwork'][0]
+      expect(item["name"]).to eq("Gadis dengan Anting-Anting Mutiara")
+      expect(item["extentions"]).to eq(["1665"])
+      expect(item['link']).not_to be_nil
+      expect(item['image']).not_to be_nil
+
+      FileUtils.rm_rf(expected_directory)
+    end
+  end
+end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -0,0 +1,98 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# The generated `.rspec` file contains `--require spec_helper` which will cause
+# this file to always be loaded, without a need to explicitly require it in any
+# files.
+#
+# Given that it is always loaded, you are encouraged to keep this file as
+# light-weight as possible. Requiring heavyweight dependencies from this file
+# will add to the boot time of your test suite on EVERY test run, even for an
+# individual file that may not need all of that loaded. Instead, consider making
+# a separate helper file that requires the additional dependencies and performs
+# the additional setup, and require it from the spec files that actually need
+# it.
+#
+# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+  # rspec-expectations config goes here. You can use an alternate
+  # assertion/expectation library such as wrong or the stdlib/minitest
+  # assertions if you prefer.
+  config.expect_with :rspec do |expectations|
+    # This option will default to `true` in RSpec 4. It makes the `description`
+    # and `failure_message` of custom matchers include text for helper methods
+    # defined using `chain`, e.g.:
+    #     be_bigger_than(2).and_smaller_than(4).description
+    #     # => "be bigger than 2 and smaller than 4"
+    # ...rather than:
+    #     # => "be bigger than 2"
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+
+  # rspec-mocks config goes here. You can use an alternate test double
+  # library (such as bogus or mocha) by changing the `mock_with` option here.
+  config.mock_with :rspec do |mocks|
+    # Prevents you from mocking or stubbing a method that does not exist on
+    # a real object. This is generally recommended, and will default to
+    # `true` in RSpec 4.
+    mocks.verify_partial_doubles = true
+  end
+
+  # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
+  # have no way to turn it off -- the option exists only for backwards
+  # compatibility in RSpec 3). It causes shared context metadata to be
+  # inherited by the metadata hash of host groups and examples, rather than
+  # triggering implicit auto-inclusion in groups with matching metadata.
+  config.shared_context_metadata_behavior = :apply_to_host_groups
+
+# The settings below are suggested to provide a good initial experience
+# with RSpec, but feel free to customize to your heart's content.
+=begin
+  # This allows you to limit a spec run to individual examples or groups
+  # you care about by tagging them with `:focus` metadata. When nothing
+  # is tagged with `:focus`, all examples get run. RSpec also provides
+  # aliases for `it`, `describe`, and `context` that include `:focus`
+  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
+  config.filter_run_when_matching :focus
+
+  # Allows RSpec to persist some state between runs in order to support
+  # the `--only-failures` and `--next-failure` CLI options. We recommend
+  # you configure your source control system to ignore this file.
+  config.example_status_persistence_file_path = "spec/examples.txt"
+
+  # Limits the available syntax to the non-monkey patched syntax that is
+  # recommended. For more details, see:
+  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
+  config.disable_monkey_patching!
+
+  # This setting enables warnings. It's recommended, but in some cases may
+  # be too noisy due to issues in dependencies.
+  config.warnings = true
+
+  # Many RSpec users commonly either run the entire suite or an individual
+  # file, and it's useful to allow more verbose output when running an
+  # individual spec file.
+  if config.files_to_run.one?
+    # Use the documentation formatter for detailed output,
+    # unless a formatter has already been configured
+    # (e.g. via a command-line flag).
+    config.default_formatter = "doc"
+  end
+
+  # Print the 10 slowest examples and example groups at the
+  # end of the spec run, to help surface which specs are running
+  # particularly slow.
+  config.profile_examples = 10
+
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = :random
+
+  # Seed global randomization in this process using the `--seed` CLI option.
+  # Setting this allows you to use `--seed` to deterministically reproduce
+  # test failures related to randomization by passing the same `--seed` value
+  # as the one that triggered the failure.
+  Kernel.srand config.seed
+=end
+end
diff --git a/spec/start_scraper_spec.rb b/spec/start_scraper_spec.rb
@@ -0,0 +1,31 @@
+require 'rspec'
+require_relative '../google_scraper.rb'
+
+# Exercises start_scraper.rb with GoogleScraper replaced by a double, so no
+# browser is started.
+RSpec.describe 'GoogleScraper Script' do
+  let(:scraper_instance) { instance_double(GoogleScraper) }
+  # Resolve the script relative to this spec file so the suite does not depend
+  # on the directory rspec is launched from.
+  let(:script_path) { File.expand_path('../start_scraper.rb', __dir__) }
+
+  before do
+    allow(GoogleScraper).to receive(:new).and_return(scraper_instance)
+    allow(scraper_instance).to receive(:perform)
+  end
+
+  context 'when query argument is provided' do
+    it 'creates a GoogleScraper instance and calls perform' do
+      stub_const('ARGV', ['query=hello'])
+
+      # `load` (unlike `require`) re-executes the script on every call.
+      load script_path
+
+      expect(GoogleScraper).to have_received(:new).with('hello')
+      expect(scraper_instance).to have_received(:perform)
+    end
+  end
+
+  context 'when query argument is missing' do
+    it 'raises an error' do
+      stub_const('ARGV', [])
+
+      expect { load script_path }.to raise_error(RuntimeError, 'Must have query args')
+    end
+  end
+end
diff --git a/start_scraper.rb b/start_scraper.rb
@@ -0,0 +1,8 @@
+require_relative 'google_scraper'
+
+args = ARGV.map { |arg| arg.split('=', 2) }.to_h
+query = args['query']
+raise("Must have query args") if query.nil?
+
+scraper = GoogleScraper.new(query)
+scraper.perform