diff --git a/.gitignore b/.gitignore index fb7a88e1..7d4d7444 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ build-iPhoneSimulator/ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: .rvmrc .DS_Store + +.byebug_history \ No newline at end of file diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..5255835f --- /dev/null +++ b/.rspec @@ -0,0 +1,3 @@ +--color +--format documentation +--require spec_helper \ No newline at end of file diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..86f08de6 --- /dev/null +++ b/Gemfile @@ -0,0 +1,8 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +# gem "rails" +gem 'nokolexbor', '~> 0.6.0' +gem 'byebug' +gem 'rspec' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..e429c0af --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,36 @@ +GEM + remote: https://rubygems.org/ + specs: + byebug (11.1.3) + diff-lcs (1.6.0) + nokolexbor (0.6.0) + nokolexbor (0.6.0-arm64-darwin) + nokolexbor (0.6.0-x86_64-darwin) + nokolexbor (0.6.0-x86_64-linux) + rspec (3.13.0) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.3) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.3) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.2) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.2) + +PLATFORMS + arm64-darwin + ruby + x86_64-darwin + x86_64-linux + +DEPENDENCIES + byebug + nokolexbor (~> 0.6.0) + rspec + +BUNDLED WITH + 2.6.3
diff --git a/lib/scrapers/generic.rb b/lib/scrapers/generic.rb
new file mode 100644
index 00000000..deb4d186
--- /dev/null
+++ b/lib/scrapers/generic.rb
@@ -0,0 +1,23 @@
+module Scrapers
+  # Runs a CSS selector against a parsed HTML node and passes the matches to
+  # a configurable processor.
+  class Generic
+    attr_accessor :selector, :processor
+
+    # Default processor: the text of whatever the selector matched.
+    DEFAULT_PROCESSOR_FN = ->(item) { item.text }
+
+    # @param selector [String] CSS selector handed to +html.css+
+    # @param processor [#call] called with the result of +html.css(selector)+
+    def initialize(selector:, processor: DEFAULT_PROCESSOR_FN)
+      @selector = selector
+      @processor = processor
+    end
+
+    # @param html [#css] parsed document or node to search
+    # @return [Object] whatever the processor returns for the matches
+    def scrape(html)
+      @processor.call(html.css(@selector))
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/scrapers/google/gallery.rb b/lib/scrapers/google/gallery.rb
new file mode 100644
index 00000000..a94b5ef7
--- /dev/null
+++ b/lib/scrapers/google/gallery.rb
@@ -0,0 +1,74 @@
+require_relative 'image'
+require_relative 'image_replacer_script'
+require_relative '../generic'
+
+module Scrapers
+  module Google
+    # Turns every element matching the gallery selector into a hash, with one
+    # entry per configured field scraper.
+    class Gallery
+      BASE_URL = 'https://www.google.com'.freeze
+      DEFAULT_SELECTOR = 'div.iELo6'.freeze
+      DEFAULT_SCRAPERS = {
+        name: Scrapers::Generic.new(selector: 'div.pgNMRc'),
+        extensions: Scrapers::Generic.new(
+          selector: 'div.cxzHyb',
+          processor: ->(div) { div.text.empty? ? nil : [div.text] }
+        ),
+        link: Scrapers::Generic.new(
+          selector: 'a',
+          # nil (key omitted from the output) when the item has no anchor or
+          # the anchor has no href, rather than raising on nil.
+          processor: ->(links) {
+            anchor = links[0]
+            href = anchor.attributes['href']&.text if anchor
+            "#{BASE_URL}#{href}" if href
+          }
+        ),
+        image: Scrapers::Google::Image.new
+      }.freeze
+      DEFAULT_SCRIPT_SCRAPER = Scrapers::Google::ImageReplacerScript
+
+      # @param parser [#HTML] HTML parser, e.g. Nokolexbor
+      # @param selector [String] CSS selector matching one gallery item
+      # @param scrapers [Hash] output key => object responding to +scrape+
+      # @param script_scraper [Class, #scrape] a Class is instantiated with no
+      #   arguments; anything else is used as given
+      def initialize(parser:, selector: DEFAULT_SELECTOR, scrapers: DEFAULT_SCRAPERS,
+                     script_scraper: DEFAULT_SCRIPT_SCRAPER)
+        @parser = parser
+        @selector = selector
+        @scrapers = scrapers
+        @script_scraper = script_scraper.is_a?(Class) ? script_scraper.new : script_scraper
+      end
+
+      # @param input [String] raw HTML
+      # @return [Array<Hash>] one hash per matched item; a key is left out when
+      #   its scraper returns nil or false
+      def scrape(input)
+        html = @parser.HTML(input)
+        share_image_map(@script_scraper.scrape(html))
+
+        html.css(@selector).map { |item| scrape_item(item) }
+      end
+
+      private
+
+      # Gives the image map to every scraper with an +image_map=+ writer, so a
+      # custom scraper set without an :image entry no longer raises.
+      def share_image_map(image_map)
+        @scrapers.each_value do |scraper|
+          scraper.image_map = image_map if scraper.respond_to?(:image_map=)
+        end
+      end
+
+      # Builds the output hash for a single gallery item.
+      def scrape_item(item)
+        @scrapers.each_with_object({}) do |(key, scraper), result|
+          value = scraper.scrape(item)
+          result[key] = value if value
+        end
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/scrapers/google/image.rb b/lib/scrapers/google/image.rb
new file mode 100644
index 00000000..e3c0bd29
--- /dev/null
+++ b/lib/scrapers/google/image.rb
@@ -0,0 +1,36 @@
+module Scrapers
+  module Google
+    # Finds the source of the first image matching the selector.
+    class Image
+      DEFAULT_SELECTOR = 'img.taFZJe'.freeze
+      PLACEHOLDER = ''.freeze
+
+      attr_accessor :selector, :image_map
+
+      def initialize(selector: DEFAULT_SELECTOR)
+        @selector = selector
+      end
+
+      # Picks, in order: the data-src attribute; the image_map entry for the
+      # element id when src equals PLACEHOLDER and the map has that id; src.
+      #
+      # @param html [#css] node to search
+      # @return the chosen source, or nil when nothing matches the selector or
+      #   the image has neither data-src nor src
+      def scrape(html)
+        images = html.css(@selector)
+        return if images.empty?
+
+        attributes = images[0].attributes
+        data_src = attributes['data-src']&.text
+        return data_src if data_src
+
+        src = attributes['src']&.text
+        return src unless src == PLACEHOLDER
+
+        id = attributes['id']&.text
+        image_map&.key?(id) ? image_map[id] : src
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/scrapers/google/image_replacer_script.rb b/lib/scrapers/google/image_replacer_script.rb
new file mode 100644
index 00000000..4c9f0be8
--- /dev/null
+++ b/lib/scrapers/google/image_replacer_script.rb
@@ -0,0 +1,49 @@
+module Scrapers
+  module Google
+    # Collects image id => image data pairs from the scripts that mention the
+    # image-replacer function.
+    class ImageReplacerScript
+      DEFAULT_SELECTOR = "script".freeze
+      IMAGE_REPLACER_FN = "_setImagesSrc".freeze
+      # Capture 1 is the s='...' value, capture 2 the id inside var ii=['...'].
+      IMAGE_DATA_PATTERN = /s='(.*?)';.*?var ii=\['(.*?)'\]/
+
+      attr_accessor :selector, :image_replacer_fn
+
+      def initialize(selector: DEFAULT_SELECTOR, image_replacer_fn: IMAGE_REPLACER_FN)
+        @selector = selector
+        @image_replacer_fn = image_replacer_fn
+      end
+
+      # @param html [#css, nil] parsed document
+      # @return [Hash{String => String}] empty when html is nil or nothing matches
+      def scrape(html)
+        return {} if html.nil?
+
+        scrape_image_map(scrape_image_replacer_script(html))
+      end
+
+      private
+
+      # Joined text of every selected node that mentions the replacer function.
+      def scrape_image_replacer_script(html)
+        html.css(@selector)
+            .map(&:text)
+            .select { |source| source.include?(@image_replacer_fn) }
+            .join
+      end
+
+      # Scans the script text for (data, id) pairs; later ids overwrite earlier ones.
+      def scrape_image_map(script)
+        script.scan(IMAGE_DATA_PATTERN).each_with_object({}) do |(base64, image_id), image_map|
+          image_map[image_id] = sanitize(base64)
+        end
+      end
+
+      # Replaces each literal \x3d escape with "=".
+      def sanitize(base64)
+        base64.gsub(/\\x3d/, "=")
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/scrape.rb b/scrape.rb
new file mode 100644
index 00000000..e9156f3d
--- /dev/null
+++ b/scrape.rb
@@ -0,0 +1,28 @@
+require 'json'
+
+require_relative 'lib/scrapers/google/gallery'
+
+# Usage: ruby scrape.rb [INPUT_HTML] [PARSER]   (PARSER: nokolexbor | nokogiri)
+input_file = ARGV[0] || './files/van-gogh-paintings.html'
+parser_name = ARGV[1] || 'nokolexbor'
+
+# Only the selected parser is required: nokogiri is not listed in the Gemfile,
+# so requiring it up front breaks the default run wherever it is not installed.
+html_parser =
+  case parser_name
+  when 'nokogiri'
+    require 'nokogiri'
+    Nokogiri
+  else
+    warn "Unknown parser '#{parser_name}', using nokolexbor" unless parser_name == 'nokolexbor'
+    require 'nokolexbor'
+    Nokolexbor
+  end
+
+scraper = Scrapers::Google::Gallery.new(parser: html_parser)
+
+html = File.read(input_file)
+artworks = scraper.scrape(html)
+
+output = { artworks: artworks }
+puts output.to_json
\ No newline at end of file
diff --git a/spec/fixtures/expected-picasso-paintings.json b/spec/fixtures/expected-picasso-paintings.json new file mode 100644 index 00000000..be87d5cc --- /dev/null +++ b/spec/fixtures/expected-picasso-paintings.json @@ -0,0 +1,274 @@ +{ + "artworks": [ + { + "name": "Guernica", + "extensions": ["1937"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Guernica+(Picasso)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLFMjAxzTLWUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYhdxLU4vyMpMTFTQCgGRxcb4mAC4Q9W5QAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAD", + "image": 
"https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRstwGXbSA9Fxfl601xrf76CQVHYnpmAu5OUMwGSgALL87wg9HE" + }, + { + "name": "The Old Guitarist", + "extensions": ["1904"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Old+Guitarist&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLEsTM0KC7WUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYBUMyUhX8c1IU3EszSxKLgDIA9X8Wz08AAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAF", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT8z6q-0Z3e2iwfVKF8hk-ln1SEjhWyBLYTE5iDXk3ZpmneG0D_" + }, + { + "name": "Girl before a Mirror", + "extensions": ["1932"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Girl+before+a+Mirror&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMCwurCrK1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWEXcM4tyFJJS0_KLUhUSFXwzi4ryiwDxlblBUwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAH", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTwVTQy_X2osffv3dXxxf_CrHZQBnT_Qm0DEHrHNTgoVo7z4KPn" + }, + { + "name": "Les Demoiselles d’Avignon", + "extensions": ["1907"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Les+Demoiselles+d%E2%80%99Avignon&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLFMq7JMcrSUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYpX1SixVcUnPzM4tTc3KA7JRHDTMdyzLT8_LzAMn1z6ZZAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAJ", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRkMWuz31hIU_4X9okeqaXFLFLuGdecGi6Yo5NZPdfL4q6z6hPt" + }, + { + "name": "Dove of Peace", + "extensions": ["1949"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Dove+(Picasso)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzTMybEwiTezKNFSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9i5XPJL0tV0AjITE4sLs7XBACbteK5UAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAL", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTJtIhDtRroGh90Hb6IMsveC6ErkAYFS4_3ihJ6apWrPTtfzw4a" + }, + { + "name": "Le 
Rêve", + "extensions": ["1932"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Le+R%C3%AAve+(Picasso)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMzEzNTMy1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWIV8UhWCDq8qS1XQCMhMTiwuztcEAPuh_H1RAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAN", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTXDIrTVvK9sRZr9u-LiB5jFRCuaQWPfpcdKqdkBUViJIrRp1ta" + }, + { + "name": "Portrait of Dora Maar", + "extensions": ["1937"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Portrait+of+Dora+Maar&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyKqw0zzDQUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYhUNyC8qKUrMLFHIT1NwyS9KVPBNTCwCANRroJRVAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAP", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThbFVLCW1Jdprc3AwE4F1WRcsCiD0rj2E4O7DgtqSMf3ppHcu3" + }, + { + "name": "Don Quixote", + "extensions": ["1955"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Don+Quixote+(Picasso)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGSkipKLC21lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWEVd8vMUAkszK_JLUhU0AjKTE4uL8zUB0LmeTVQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAR", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRxnZZIlUSmERgkSHeoj9GiKp1H9bmTe_ggYHcA_ZZqcBFlC6gW" + }, + { + "name": "The Kiss", + "extensions": ["1925"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Kiss&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyTCkrKSnQUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuUIyUhV8M4sLgYANBx8SkgAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAT", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQvZVQdGHKBGKbpi9-BXvySTahVyoFRT3NiGMacxUgQ_qgSFvoJ" + }, + { + "name": "Bull's Head", + "extensions": ["1942"], + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Bull%27s+Head&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArHykixKcgq1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWLmdSnNy1IsVPFITUwCwB7t6SgAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAV", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRyeScUWwAGcHyj0wfWuCvGgkrZc_OodFf6KN9_E7N8MjfYu-HE" + }, + { + "name": "Still life with the caned chair", + "extensions": ["1912"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Still+life+with+the+caned+chair&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyNo8vjrfUUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYpUPLsnMyVHIyUxLVSjPLMlQKMlIVUhOzEtNUUjOSMwsAgADu_HYXwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAX", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQ9C9dkn1JbOAce_NRr-IbH0H_ttSywLANWPqBDbtD4Sm8G9TyG" + }, + { + "name": "Girl on the ball", + "extensions": ["1905"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Girl+on+the+ball&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyNqssMs7TUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYhVwzyzKUcjPUyjJSFVISszJAQAycOWXUAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAZ", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcSDXVgeS4hh8LBzoSZMr8A73MgwVYfvxP6x1sTIDRsWGCq0qBmj" + }, + { + "name": "Self-Portrait", + "extensions": ["1901"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Self-Portrait&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyLC5KMsrVUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuUNTs1J0w3ILyopSswsAQCx1TH2TQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAb", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRJaDd_xXpSpHrIJ9UJlE22PeNkWwE37x-v2WDySh8PwUyMZu1o" + }, + { + "name": "Child with a Dove", + "extensions": ["1901"], + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Child+with+a+Dove&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArFKTI3ic4u0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWAWdMzJzUhTKM0syFBIVXPLLUgEXsnGTUAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAd", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcS-wR2ZNWF-wur0HEJsHtbW1dX9kWSdyZNoABBOm0KOapIRIZoF" + }, + { + "name": "Science and Charity", + "extensions": ["1897"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Science+and+Charity&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBrEMjSxzsouLtJSyk630yzKLSxNz4hOLSpCYmcUlVuX5RdnFi1iFg5MzU_OSUxUS81IUnDMSizJLKgEnplW7UwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAf", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcSwwDiSjslsExXvEUj9KRJDt4qgSTzlQOpzSkJlV5uYaDTPISch" + }, + { + "name": "Garçon à la pipe", + "extensions": ["1905"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Gar%C3%A7on+%C3%A0+la+pipe&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLGMqrJM87SUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYhdwTiw4vz89TOLxAISdRoSCzIBUA_dN0QlAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAh", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRjs_E-TtzMRXJBD5ykSm-wPeXVRHeNUklj3fdTZi7o9H1PqEEk" + }, + { + "name": "Two Girls Reading", + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Two+Girls+Reading&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzTMNkrKMIwvz9JSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9iFQwpz1dwzyzKKVYISk1MycxLBwCGtZyEUwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAj", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRMyFoUvxK5f0YMUT9_tw9k27rTBxs-ATf_UHyQvi-wGgMhiAh3" + }, + { + "name": "The Tragedy", + "extensions": ["1903"], + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Tragedy+(Picasso)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyyCjJTivRUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYhUNyUhVCClKTE9NqVTQCMhMTiwuztcEACIAcqhVAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAl", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRRd-pzKzVoO_dluN6gBtdZkMLqxVIt7aLAIkYKS-C_4l38UB7W" + }, + { + "name": "First Communion", + "extensions": ["1896"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=First+Communion&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMNq7MLS8uqtJSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9i5XfLLCouUXDOz80tzcvMzwMAmyIPf1EAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAn", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcQIvN4f5GodeJQG8NitctoEZ3AUxyHrR3HNrdMfskxKakErHCG7" + }, + { + "name": "The Three Dancers", + "extensions": ["1925"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Three+Dancers&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLHSjcsyyrWUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYBUMyUhVCMopSUxVcEvOSU4uKATPGHuxPAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAp", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQSJ5CKSr2VkgtsGZQ-SdMcg_YIy_dShuYb-G8WKvm-hGEW5Pdm" + }, + { + "name": "Ma Jolie", + "extensions": ["1912"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Ma+Jolie+(Picasso,+New+York)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArFMzNNyksq0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWGV8ExW88nMyUxU0AjKTE4uL83UU_FLLFSKB0poAsacAnVsAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAr", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSoD0dT6mQwr6rySogsSANW3QRLa9sV5i_CjEFR8pM7BLrOxsvo" + }, + { + "name": "Yellow picador", + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Yellow+picador&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMMksvSCrPNdFSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9i5YtMzcnJL1coyExOTMkvAgCRZ42eUAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAt", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRo4Ksi8JwlmWtYIK2UU9QZKG9e75OeHxekMbH8-QHIsHUf9c4Y" + }, + { + "name": "Woman with flower", + "extensions": ["1932"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Woman+with+flower&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMNi5JK88oKdBSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9iFQzPz03MUyjPLMlQSMvJL08tAgB2tP9PUwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAv", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ0S3M6GaUBM2IO4bvfM4tmnbPzwAMrvFE14FaU1H7OxEVQmk5e" + }, + { + "name": "Sylvette", + "extensions": ["1954"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Sylvette&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiBLHMKw2NM7SUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYOYIrc8pSS0pSARyYpQVGAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAx", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQr0p3Q32H50eHesLH5lQG-FFvdYYdff0a_D5_5SnS4hvYs4Qrb" + }, + { + "name": "Family of Saltimbanques", + "extensions": ["1905"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Family+of+Saltimbanques&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArEyjPOK88q0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWMXdEnMzcyoV8tMUghNzSjJzkxLzCktTiwHTTWhaVgAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhAz", + "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQAG6FYuThZlmqrccyubJJ6xXcJC7IwL1AxUeVLUP8PL4BnloBE" + }, + { + "name": "Weeping Woman with Handkerchief", + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Weeping+Woman+with+Handkerchief&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMKi_MS0u3LNdSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9ilQ9PTS3IzEtXCM_PTcxTKM8syVDwSMxLyU4tSs7ITE0DAMuwbdVhAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA1", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcSCfxxad3tarJeywtUZD9S-va4lwLJ7e3hvXl04gC7UrmMumXpG" + }, + { + "name": "Femme à la montre", + "extensions": ["1932"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Femme+%C3%A0+la+montre&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzQsyzIxKSjLztVSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9iFXJLzc1NVTi8QCEnUSE3P6-kKBUA3ba-lFQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA3", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSCmXWwM6SRjM0T9HOl0ABYVw7XjQTNNcQdUPTLdnzx4Iz5HN3g" + }, + { + "name": "Seated Woman", + "extensions": ["1927"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Seated+Woman&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMC4sMk9O1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWHmCUxNLUlMUwvNzE_MAKrNmGUsAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA5", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSkIAZtGZWhDE6uM0tNN2omEfBmHnT7uVhKGuJJWy60L7g2pn3p" + }, + { + "name": "The Actor", + "extensions": ["1905"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Actor+(painting)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGSzLKyjHK1lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWEVCMlIVHJNL8osUNAoSM_NKMvPSNQEewTrZUwAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA7", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcSeHfUr9Ea7ZkxeNe4uEnv-YjVZGa67Ug7MXQh6bKJDBRik1ilH" + }, + { + "name": "Seated Harlequin", + "extensions": ["1901"], + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Seated+Harlequin&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyyEpLMazSUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYhUITk0sSU1R8EgsykktLM3MAwCq9WOyUAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA9", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSjapy9KLJo2U7k8ChjEmEtTkGuu3M8vWpuOeSDi7DSemDtmxKP" + }, + { + "name": "Le petit picador jaune", + "extensions": ["1889"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Le+petit+picador+jaune&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArHSsyurLM21lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWMV8UhUKUksySxQKMpMTU_KLFLISS_NSASgfhGNVAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhA_", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTasF87LC2Jno-UFZiMHxSRM1vatwImKhcnDIQE4QKmcQPYlAMv" + }, + { + "name": "Girl with mandolin", + "extensions": ["1910"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Girl+with+mandolin&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMC4uMMoq0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWIXcM4tyFMozSzIUchPzUvJzMvMANurbQFEAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBB", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQHdCUod1LjaB-uYB5pIzN2ZORFZLtpKAejbMg1Ld9Z0fLsPlCd" + }, + { + "name": "Woman Ironing", + "extensions": ["1904"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Woman+Ironing&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMqkgpyS7PNdRSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9i5Q3Pz03MU_Asys_LzEsHAIIWselPAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBD", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcQGn3xxVEeKrl4jxABgYod4eOq6ZIwePogsnaFfy0wfFLBUuUh1" + }, + { + "name": "Celestina", + "extensions": ["1904"], + "link": 
"https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=la+celestina+picasso&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMMkvPtSwzMNdSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9iFclJVEhOzUktLsnMS1QoyExOLC7OBwAyUdfPVgAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBF", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSnIY7Jz0X5ApNwuFxCQZ7t5zxHzJU41Yl5WvlnbpA0Pw-zv4rL" + }, + { + "name": "The Charnel House", + "extensions": ["1945"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+Charnel+House&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMCwsrks20lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWAVDMlIVnDMSi_JScxQ88kuLUwGhRqLAUAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBH", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRUfX9w7mrJUD169hIqwDleu1frHvKlmb_w10Lf8aXcOK7up5in" + }, + { + "name": "Bather", + "extensions": ["1909"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Bather&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMCwuryiq0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWNmcEksyUosAZr7G4kUAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBJ", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTKlbtyfxrH9dY0Z2q1XyFbRUd0ZLzBJ4oC9XMKIZnoinsn5urD" + }, + { + "name": "War and Peace", + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=War+and+Peace&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyKjAvN6nSUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuUNTyxSSMxLUQhITUxOBQBRE2CMTQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBL", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSN_W_EJ4Yizec7F6aS_kM71dQjVLbhu5i7GU9vi1WhmUGW_KUN" + }, + { + "name": "On the Beach", + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=On+the+Beach&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyLEnKLizWUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuXxz1MoyUhVcEpNTM4AAMbnWIJMAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBN", + 
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSKbJfgnUIV7Ny4awo_IslfZzJTvALlor6jgyPhDgSyRSFvsysD" + }, + { + "name": "The serenade", + "extensions": ["1942"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=The+serenade&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyqLQwKanUUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuUJyUhVKE4tSs1LTEkFAJkcrgpMAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBP", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR0ypB_OXmyj8LnMRvOEJsyeDGF8CL6LHXsjAToX_b8dvgrCVST" + }, + { + "name": "Woman with book", + "extensions": ["1932"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Woman+with+book&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1U_XNzRMNi5JNy6vMNRSyk620i_LLC5NzIlPLCpBYmYWl1iV5xdlFy9i5Q_Pz03MUyjPLMlQSMrPzwYAyjM5BlEAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBR", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRhmMo50Q0GDDOpXP9oqYqtJ8AemZ3lrx_xXi9hw8f91f9yDJyT" + }, + { + "name": "Portrait of Ambroise Vollard", + "extensions": ["1910"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Portrait+of+Ambroise+Vollard&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyqoovqIzXUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYpUJyC8qKUrMLFHIT1NwzE0qys8sTlUIy8_JSSxKAQBpwAhuXAAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBT", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSc5Clp9PtpSl-LJgshboXPGoW8l-5gx-QjHLQZ-YNhQwfyx-vu" + }, + { + "name": "Jacqueline", + "extensions": ["1961"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Jacqueline+(painting)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArGMCooqk1O0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWEW9EpMLS1NzMvNSFTQKEjPzSjLz0jUBqNuyblQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBV", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSRU39-Q6TW3DgzNJpnKghKZDIEcDd49StP2vrJkq7YlE6P_TkQ" + }, + { + "name": 
"Tête de Femme", + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=T%C3%AAte+de+Femme&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyMrZITq7SUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYuULObyqJFUhJVXBLTU3NxUARv8x204AAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBX", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRbOQ7Ic4G9i3qLTpsPC80RQ4ctl52HFKaNA6CthiqeNcPigSUG" + }, + { + "name": "Still life (The dessert)", + "extensions": ["1901"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Still+life+(The+dessert)&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQyNi_LS4_XUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYpUILsnMyVHIyUxLVdAIyUhVSEktLk4tKtEEAA_tAmxYAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBZ", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ248LO5IdMOFpipW5UerH5xCHO3nhvnMdtVdX6Sh9ctwKpXbMU" + }, + { + "name": "Minotauromachy", + "extensions": ["1935"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Minotauromachy&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fiArFMzHOKivK0lLKTrfTLMotLE3PiE4tKkJiZxSVW5flF2cWLWPl8M_PySxJLi_JzE5MzKgGuASW-TQAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBb", + "image": "https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTV4t4otcQLqFYTD8zaPvzj6ovc0xLR3cVRUQnIlrAtzgO8K2Qv" + }, + { + "name": "Woman dressed in blue", + "extensions": ["1901"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Woman+dressed+in+blue&stick=H4sIAAAAAAAAAONgFuLQz9U3MDOIN1fi1k_XNzQytshNqijRUspOttIvyywuTcyJTywqQWJmFpdYlecXZRcvYhUNz89NzFNIKUotLk5NUcjMU0jKKU0FAO8ptotVAAAA&sa=X&ved=2ahUKEwjfl83q8oSMAxV1r5UCHQSnPf0Qtq8DegQIBhBd", + "image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcShIW46paqJmCfxH5eR609tgZ1emO04GprcQCma-z6Hma49sVJG" + } + ] +} \ No newline at end of file diff --git a/spec/fixtures/expected-steve-mccurry-photos.json b/spec/fixtures/expected-steve-mccurry-photos.json new file mode 100644 index 
00000000..5ecb50c9 --- /dev/null +++ b/spec/fixtures/expected-steve-mccurry-photos.json @@ -0,0 +1,16 @@ +{ + "photos": [ + { + "name": "Afghan Girl", + "extensions": ["1984"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Afghan+Girl&stick=H4sIAAAAAAAAAONgFuLUz9U3MCs0LUhWAjONLMuq4rWUspOt9Msyi0sTc-ITi0qQmJnFJVbl-UXZxYtYuR3T0jMS8xTcM4tyAHCp4R9KAAAA&sa=X&ved=2ahUKEwjwqvDD9oSMAxU3lZUCHaRhKk4Qtq8DegQIBhAD", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTuMonsjG1WMzRhs5-UDE0t2Ww5phvf7GjFSnfF9k9IhvQ78Fza" + }, + { + "name": "Camels under a Blackened Sky", + "extensions": ["1991"], + "link": "https://www.google.com/search?sca_esv=d856a50ff87a9623&cs=0&q=Camels+under+a+Blackened+Sky&stick=H4sIAAAAAAAAAONgFuLUz9U3MCs0LUhW4gIx44uMi0sqtJSyk630yzKLSxNz4hOLSpCYmcUlVuX5RdnFi1hlnBNzU3OKFUrzUlKLFBIVnHISk7NT81JTFIKzKwGElb2WXAAAAA&sa=X&ved=2ahUKEwjwqvDD9oSMAxU3lZUCHaRhKk4Qtq8DegQIBhAF", + "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRP0OFe84RbN4rBF5DtkOwiG-YJdWWIq0aExcwqsemLRfYxRd7S" + } + ] +} diff --git a/spec/fixtures/picasso-paintings.html b/spec/fixtures/picasso-paintings.html new file mode 100644 index 00000000..2898e68c --- /dev/null +++ b/spec/fixtures/picasso-paintings.html @@ -0,0 +1,52 @@ +picasso - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Quick Settings
Advanced Search
About 283,000,000 results (0.35 seconds) 
Pablo Picasso
Spanish painter and sculptor
OverviewArtworksPeriods

Search Results

Page navigation

Google apps
\ No newline at end of file diff --git a/spec/fixtures/steve-mccurry-photos.html b/spec/fixtures/steve-mccurry-photos.html new file mode 100644 index 00000000..935889b1 --- /dev/null +++ b/spec/fixtures/steve-mccurry-photos.html @@ -0,0 +1,74 @@ +Steve McCurry - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Quick Settings
Advanced Search
About 2,210,000 results (0.47 seconds) 
Steve McCurry
American photographer
OverviewArtworksBooksAwards

Search Results

Page navigation

Google apps
\ No newline at end of file diff --git a/spec/lib/scrapers/generic_spec.rb b/spec/lib/scrapers/generic_spec.rb new file mode 100644 index 00000000..ff087229 --- /dev/null +++ b/spec/lib/scrapers/generic_spec.rb @@ -0,0 +1,43 @@ +require 'nokolexbor' +require 'scrapers/generic' + +RSpec.describe Scrapers::Generic do + let(:html) { Nokolexbor::HTML('
Hello World
') } + let(:selector) { '.sample' } + + describe '#initialize' do + it 'creates an instance of the scraper' do + scraper = described_class.new(selector: selector) + expect(scraper.processor).to eq(described_class::DEFAULT_PROCESSOR_FN) + end + + context 'when using a custom processor' do + it 'creates the scraper using the customer processor' do + custom_processor = ->(item) { item.text.upcase } + scraper = described_class.new(selector: selector, processor: custom_processor) + + expect(scraper.selector).to eq(selector) + expect(scraper.processor).to eq(custom_processor) + end + end + end + + describe '#scrape' do + context 'using the default processor' do + it 'scrapes the content using the selector and the default processor' do + scraper = described_class.new(selector: selector) + + expect(scraper.scrape(html)).to eq('Hello World') + end + end + + context 'using a custom processor' do + it 'scrapes the content using the custom processor' do + custom_processor = ->(item) { item.text.upcase } + scraper = described_class.new(selector: selector, processor: custom_processor) + + expect(scraper.scrape(html)).to eq('HELLO WORLD') + end + end + end +end \ No newline at end of file diff --git a/spec/lib/scrapers/google/gallery_spec.rb b/spec/lib/scrapers/google/gallery_spec.rb new file mode 100644 index 00000000..3076d1b7 --- /dev/null +++ b/spec/lib/scrapers/google/gallery_spec.rb @@ -0,0 +1,49 @@ +require 'nokolexbor' +require 'scrapers/google/gallery' +require 'json' + +RSpec.describe Scrapers::Google::Gallery do + let(:parser) { Nokolexbor } + + describe '#scrape' do + context 'Van Gogh artwork' do + let(:scraper) { described_class.new(parser: parser) } + + it 'scrapes the artwork' do + input_file_content = File.read(File.join(__dir__, '../../../../files/van-gogh-paintings.html')) + expected_file_content = File.read(File.join(__dir__, '../../../../files/expected-array.json')) + expected_artworks = JSON.parse(expected_file_content, symbolize_names: true) + + artworks = 
scraper.scrape(input_file_content) + expect({artworks: artworks}).to eq(expected_artworks) + end + end + + context 'Picasso paintings' do + let(:scraper) { described_class.new(parser: parser) } + + it 'scrapes the paintings' do + input_file_content = File.read(File.join(__dir__, '../../../fixtures/picasso-paintings.html')) + expected_file_content = File.read(File.join(__dir__, '../../../fixtures/expected-picasso-paintings.json')) + expected_artworks = JSON.parse(expected_file_content, symbolize_names: true) + + artworks = scraper.scrape(input_file_content) + + expect({artworks: artworks}).to eq(expected_artworks) + end + end + + context 'Steve McCurry photos' do + let(:scraper) { described_class.new(parser: parser) } + + it 'scrapes the paintings' do + input_file_content = File.read(File.join(__dir__, '../../../fixtures/steve-mccurry-photos.html')) + expected_file_content = File.read(File.join(__dir__, '../../../fixtures/expected-steve-mccurry-photos.json')) + expected_artworks = JSON.parse(expected_file_content, symbolize_names: true) + + photos = scraper.scrape(input_file_content) + expect({photos: photos}).to eq(expected_artworks) + end + end + end +end \ No newline at end of file diff --git a/spec/lib/scrapers/google/image_replacer_script_spec.rb b/spec/lib/scrapers/google/image_replacer_script_spec.rb new file mode 100644 index 00000000..f2166346 --- /dev/null +++ b/spec/lib/scrapers/google/image_replacer_script_spec.rb @@ -0,0 +1,80 @@ +require 'nokolexbor' +require 'scrapers/google/image_replacer_script' + +RSpec.describe Scrapers::Google::ImageReplacerScript do + let(:replacer) { described_class.new } + + describe '#initialize' do + context 'using the default values' do + it 'creates a scraper using the default values' do + expect(replacer.selector).to eq(described_class::DEFAULT_SELECTOR) + expect(replacer.image_replacer_fn).to eq(described_class::IMAGE_REPLACER_FN) + end + end + + context 'using a custom selector and func name' do + it 'creates a scraper 
using custom values' do + scraper = described_class.new(selector: 'div.sample', image_replacer_fn: 'myFunc') + + expect(scraper.selector).to eq('div.sample') + expect(scraper.image_replacer_fn).to eq('myFunc') + end + end + end + + describe '#scrape' do + context 'when html is nil' do + it 'returns empty hash' do + expect(replacer.scrape(nil)).to eq({}) + end + end + + context 'with valid html' do + let(:html) do + Nokolexbor::HTML(<<~HTML) + + + + + HTML + end + + it 'extracts image mappings' do + expected = { + 'image1' => 'base64data==', + 'image2' => 'otherdata=' + } + expect(replacer.scrape(html)).to eq(expected) + end + end + + context 'with custom function name' do + let(:replacer) { described_class.new(image_replacer_fn: '_customFn') } + let(:html) do + Nokolexbor::HTML(<<~HTML) + + + + HTML + end + + it 'scrapes the correct data' do + expect(replacer.scrape(html)).to eq({'image3' => 'testdata='}) + end + end + + context 'with no matching script' do + let(:html) { Nokolexbor::HTML('') } + + it 'returns empty hash' do + expect(replacer.scrape(html)).to eq({}) + end + end + end +end \ No newline at end of file diff --git a/spec/lib/scrapers/google/image_spec.rb b/spec/lib/scrapers/google/image_spec.rb new file mode 100644 index 00000000..921b70e7 --- /dev/null +++ b/spec/lib/scrapers/google/image_spec.rb @@ -0,0 +1,63 @@ +require 'nokolexbor' +require 'scrapers/google/image' + +RSpec.describe Scrapers::Google::Image do + let(:scraper) { described_class.new } + let(:html) { Nokolexbor::HTML(html_content) } + + describe '#initialize' do + context 'with the default selector' do + it 'creates a scraper using the default selector' do + scraper = described_class.new + + expect(scraper.selector).to eq(described_class::DEFAULT_SELECTOR) + end + end + + context 'with a custom selector' do + it 'creates a scraper using the custom selector' do + scraper = described_class.new(selector: '.custom-class') + + expect(scraper.selector).to eq('.custom-class') + end + end + end 
+ + describe '#scrape' do + context 'when no matching images found' do + let(:html_content) { '
No images here
' } + + it 'returns nil' do + expect(scraper.scrape(html)).to be_nil + end + end + + context 'with regular image src' do + let(:html_content) { '' } + + it 'returns the src url' do + expect(scraper.scrape(html)).to eq('image.jpg') + end + end + + context 'with data-src attribute' do + let(:html_content) { '' } + + it 'returns the data-src url' do + expect(scraper.scrape(html)).to eq('real-image.jpg') + end + end + + context 'with placeholder image' do + let(:html_content) { "" } + + before do + scraper.image_map = {'img1' => 'mapped-image.jpg'} + end + + it 'returns mapped image url' do + expect(scraper.scrape(html)).to eq('mapped-image.jpg') + end + end + end +end \ No newline at end of file diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..26027830 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1 @@ +require 'byebug' \ No newline at end of file