Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: William: Google image scraper #312

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--require spec_helper
142 changes: 142 additions & 0 deletions google_scraper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
require 'fileutils'
require 'json'
require 'optparse'
require 'securerandom'
require 'uri'

require 'byebug'
require 'selenium-webdriver'

# Runs a single Google search in a Selenium-driven Chrome session and stores
# three artefacts under `results/<sanitized query>_results`:
#
# * `screenshot.png`      - screenshot of the loaded page
# * `page_source.html`    - HTML returned by the driver
# * `expected_array.json` - scraped data under the `artwork` and `image` keys
#
# The element locators used below are hard-coded class names / ids taken from
# the result page markup, so they need updating whenever that markup changes.
class GoogleScraper
  GOOGLE_HOST = 'https://www.google.com'

  attr_reader :query, :driver, :directory_name

  # @param query [String] free-text search phrase
  def initialize(query)
    @query = query
    # Drop every non-word character so the query can be used as a folder name.
    @directory_name = "results/#{query.gsub(/\W/, '')}_results"
    @driver = build_driver
  end

  # Loads the results page, saves the screenshot and HTML, scrapes both
  # sections and writes the JSON summary. The browser is always closed,
  # including when any step raises.
  #
  # @return [Integer] number of bytes written to `expected_array.json`
  def perform
    driver.navigate.to search_url
    random_sleep
    wait_for_page_load

    FileUtils.mkdir_p directory_name
    save_page_snapshot
    write_results(scrape_artwork.merge(scrape_images))
  ensure
    driver.quit
  end

  private

  # Builds the Chrome session with flags and a desktop User-Agent intended to
  # make the automated browser look like a regular one.
  def build_driver
    options = Selenium::WebDriver::Chrome::Options.new
    [
      '--disable-blink-features=AutomationControlled',
      '--disable-infobars',
      '--start-maximized',
      '--disable-gpu',
      '--no-sandbox',
      '--disable-dev-shm-usage'
    ].each { |flag| options.add_argument(flag) }

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    options.add_argument("--user-agent=#{user_agent}")
    # options.add_argument('--headless') # Run Chrome in headless mode

    Selenium::WebDriver.for :chrome, options: options
  end

  # Search URL with the query form-encoded, so spaces and reserved characters
  # such as `&`, `#` or `+` reach Google as part of the search phrase instead
  # of being interpreted as URL syntax.
  #
  # @return [String]
  def search_url
    "#{GOOGLE_HOST}/search?q=#{URI.encode_www_form_component(query)}"
  end

  # Blocks (up to 10 seconds) until the document reports it finished loading.
  def wait_for_page_load
    wait = Selenium::WebDriver::Wait.new(timeout: 10) # Adjust timeout as needed
    wait.until { driver.execute_script('return document.readyState') == 'complete' }
  end

  # Saves the screenshot and the page HTML into the results directory.
  def save_page_snapshot
    driver.save_screenshot("#{directory_name}/screenshot.png")
    File.write("#{directory_name}/page_source.html", driver.page_source)
  end

  # Serialises the scraped data as pretty-printed JSON.
  #
  # @param result [Hash] merged output of the scrape_* methods
  # @return [Integer] number of bytes written
  def write_results(result)
    File.write("#{directory_name}/expected_array.json", JSON.pretty_generate(result))
  end

  # Scrapes every link inside the first element with class `Cz5hV`.
  #
  # @return [Hash] `{ artwork: [...] }`, or `{}` when that element is absent
  # @raise [Selenium::WebDriver::Error::NoSuchElementError] when a link lacks
  #   one of the expected child elements
  def scrape_artwork
    artwork_box = driver.find_elements(:class_name, 'Cz5hV').first
    return {} unless artwork_box

    entries = artwork_box.find_elements(:tag_name, 'a').map do |item|
      link = item.attribute('href')
      image = item.find_element(:tag_name, 'img').attribute('src')
      name = item.find_element(:class_name, 'pgNMRc').text
      # The sub-line holds one or more values separated by a middle dot.
      extentions = item.find_element(:class_name, 'cxzHyb').text.split(' · ')

      # NOTE: the misspelled `extentions` key is part of the stored JSON
      # format (specs and fixtures read it), so it is kept as is.
      { name: name, extentions: extentions, link: link, image: image }
    end

    { artwork: entries }
  end

  # Scrapes every `w43QB` tile inside the element with id `iur`.
  #
  # @return [Hash] `{ image: [...] }`, or `{}` when that element is absent
  # @raise [Selenium::WebDriver::Error::NoSuchElementError] when a tile lacks
  #   one of the expected child elements
  def scrape_images
    image_box = driver.find_elements(:id, 'iur').first
    return {} unless image_box

    # NOTE(review): `jEgXc` is presumably the control that expands the image
    # section - confirm. It is clicked when present; a page without it is
    # still scraped instead of aborting the whole run.
    driver.find_elements(:class_name, 'jEgXc').first&.click

    entries = image_box.find_elements(:class_name, 'w43QB').map do |item|
      link = item.find_element(:tag_name, 'a').attribute('href')
      image = item.find_element(:class_name, 'gdOPf').find_element(:tag_name, 'img').attribute('src')
      source_box = item.find_element(:class_name, 'VaiWld')
      title = item.find_element(:class_name, 'Yt787').text
      source_icon = source_box.find_element(:tag_name, 'img').attribute('src')
      source_site = item.find_element(:class_name, 'R8BTeb').text

      { title: title, source_icon: source_icon, source_site: source_site, link: link, image: image }
    end

    { image: entries }
  end

  # Pauses for 1 or 2 seconds (picked at random) after navigation.
  def random_sleep
    sleep rand(1..2)
  end
end
88 changes: 88 additions & 0 deletions results/CuteCatPhotos_results/expected_array.json

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions results/CuteCatPhotos_results/page_source.html

Large diffs are not rendered by default.

Binary file added results/CuteCatPhotos_results/screenshot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
453 changes: 453 additions & 0 deletions results/LeonardoDaVinciPaintings_results/expected_array.json

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions results/LeonardoDaVinciPaintings_results/page_source.html

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
445 changes: 445 additions & 0 deletions results/VanGoghpainting_results/expected_array.json

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions results/VanGoghpainting_results/page_source.html

Large diffs are not rendered by default.

Binary file added results/VanGoghpainting_results/screenshot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions spec/google_scraper_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
require 'rspec'
require 'json'
require_relative '../google_scraper.rb'

# End-to-end spec: runs GoogleScraper#perform unstubbed for a fixed query and
# checks the files it writes plus the first scraped artwork entry.
# NOTE(review): the expected name is a localized title, so the assertion
# presumably depends on the locale Google serves to the test machine - confirm.
describe 'Google Image Scraper' do
  let(:query) { 'Johannes Vermeer artwork' }
  let(:expected_directory) { './results/JohannesVermeerartwork_results' }

  # Start from a clean slate and always clean up afterwards. Doing the removal
  # in hooks (rather than as the last statement of the example) means a failed
  # expectation no longer leaves files behind that would let the
  # file-existence checks of a later run pass without fresh output.
  before { FileUtils.rm_rf(expected_directory) }
  after { FileUtils.rm_rf(expected_directory) }

  # NOTE(review): the group is labelled '#scrape' but the method exercised is
  # #perform.
  describe '#scrape' do
    it 'returns correct result' do
      GoogleScraper.new(query).perform

      # check if file generated
      expect(Dir.exist?(expected_directory)).to be_truthy
      expect(File.exist?("#{expected_directory}/expected_array.json")).to be_truthy
      expect(File.exist?("#{expected_directory}/page_source.html")).to be_truthy
      expect(File.exist?("#{expected_directory}/screenshot.png")).to be_truthy

      # test first item
      file_content = File.read("#{expected_directory}/expected_array.json")
      data = JSON.parse(file_content)
      item = data['artwork'][0]
      expect(item["name"]).to eq("Gadis dengan Anting-Anting Mutiara")
      expect(item["extentions"]).to eq(["1665"])
      expect(item['link']).not_to be_nil
      expect(item['image']).not_to be_nil
    end
  end
end
98 changes: 98 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# This file was generated by the `rspec --init` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# The generated `.rspec` file contains `--require spec_helper` which will cause
# this file to always be loaded, without a need to explicitly require it in any
# files.
#
# Given that it is always loaded, you are encouraged to keep this file as
# light-weight as possible. Requiring heavyweight dependencies from this file
# will add to the boot time of your test suite on EVERY test run, even for an
# individual file that may not need all of that loaded. Instead, consider making
# a separate helper file that requires the additional dependencies and performs
# the additional setup, and require it from the spec files that actually need
# it.
#
# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  # rspec-expectations: include the text of helper methods defined with
  # `chain` in the `description` and `failure_message` of custom matchers,
  # e.g. "be bigger than 2 and smaller than 4" rather than "be bigger than 2".
  # This becomes the default in RSpec 4. An alternative assertion library
  # (such as wrong or the stdlib/minitest assertions) could be selected here.
  config.expect_with(:rspec) { |matchers| matchers.include_chain_clauses_in_custom_matcher_descriptions = true }

  # rspec-mocks: refuse to mock or stub a method that does not exist on a real
  # object. This becomes the default in RSpec 4. An alternative test double
  # library (such as bogus or mocha) could be selected via `mock_with`.
  config.mock_with(:rspec) { |doubles| doubles.verify_partial_doubles = true }

  # Shared context metadata is inherited by the metadata hash of host groups
  # and examples instead of triggering implicit auto-inclusion in groups with
  # matching metadata. RSpec 4 makes this the only behaviour; the option
  # exists solely for backwards compatibility in RSpec 3.
  config.shared_context_metadata_behavior = :apply_to_host_groups

  # The settings below are suggested to provide a good initial experience
  # with RSpec. They are disabled; move them out of the =begin/=end section
  # to enable them.
=begin
  # This allows you to limit a spec run to individual examples or groups
  # you care about by tagging them with `:focus` metadata. When nothing
  # is tagged with `:focus`, all examples get run. RSpec also provides
  # aliases for `it`, `describe`, and `context` that include `:focus`
  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
  config.filter_run_when_matching :focus

  # Allows RSpec to persist some state between runs in order to support
  # the `--only-failures` and `--next-failure` CLI options. We recommend
  # you configure your source control system to ignore this file.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Limits the available syntax to the non-monkey patched syntax that is
  # recommended. For more details, see:
  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = "doc"
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end
31 changes: 31 additions & 0 deletions spec/start_scraper_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
require 'rspec'
require_relative '../google_scraper.rb'

# Exercises the start_scraper.rb command-line script with GoogleScraper
# replaced by a verifying double, so no browser is started.
RSpec.describe 'GoogleScraper Script' do
  let(:scraper_instance) { instance_double(GoogleScraper) }
  # Resolve the script relative to this spec file so the examples do not
  # depend on the directory rspec is launched from.
  let(:script_path) { File.expand_path('../start_scraper.rb', __dir__) }

  before do
    allow(GoogleScraper).to receive(:new).and_return(scraper_instance)
    allow(scraper_instance).to receive(:perform)
  end

  context 'when query argument is provided' do
    it 'creates a GoogleScraper instance and calls perform' do
      stub_const('ARGV', ['query=hello'])

      # `load` (unlike `require`) re-executes the script on every call, so
      # each example runs it against its own stubbed ARGV.
      load script_path

      expect(GoogleScraper).to have_received(:new).with('hello')
      expect(scraper_instance).to have_received(:perform)
    end
  end

  context 'when query argument is missing' do
    it 'raises an error' do
      stub_const('ARGV', [])

      expect { load script_path }.to raise_error(RuntimeError, 'Must have query args')
    end
  end
end
8 changes: 8 additions & 0 deletions start_scraper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
require_relative 'google_scraper'

# Command-line entry point. Usage: ruby start_scraper.rb "query=<search phrase>"
#
# Arguments are read as `key=value` pairs. Tokens without an '=' are ignored
# rather than handed to `to_h`, which would raise an unrelated ArgumentError
# about array length.
pairs = ARGV.select { |arg| arg.include?('=') }.map { |arg| arg.split('=', 2) }
query = pairs.to_h['query']

# A blank value (`query=`) is treated the same as a missing argument so the
# browser is never launched for an empty search.
raise('Must have query args') if query.nil? || query.strip.empty?

scraper = GoogleScraper.new(query)
scraper.perform