anemone with hpricot
by Michał Kuklis on 11/01/2010. Anemone is a pretty cool DSL used for web crawling. I used it with Hpricot to get a feeling for what’s possible. Below is a simple example which crawls and scrapes data from otodom, a popular Polish real estate website:
require 'rubygems'
require 'sanitize'
require 'anemone'
require 'open-uri'
require 'hpricot'
#otodom.pl
# Crawl otodom.pl starting from a saved search-results query and extract the
# main fields of every offer-details page with Hpricot. Anemone is told to
# keep its crawl state in the "crawl1.pstore" PStore file.

# URL patterns the crawl is restricted to: paginated search results and
# individual offer pages (e.g. "...-id12345.html").
search_results_url = /mod=search&act=searchResults&qid=/
offer_details_url  = /[a-zA-Z]+-id[0-9]*\.html$/

Anemone.crawl("http://otodom.pl/index.php?mod=search&act=searchResults&qid=46911208",
              {:storage => Anemone::Storage.PStore("crawl1.pstore")}) do |anemone|
  # filter out useless pages: drop every link that is neither a search-results
  # page nor an offer-details page, so only those two kinds are followed
  anemone.focus_crawl do |page|
    page.links.delete_if do |link|
      url = link.to_s
      url !~ search_results_url && url !~ offer_details_url
    end
  end

  # process details pages
  anemone.on_pages_like(offer_details_url) do |page|
    # Hpricot builds its document from a String or IO, so hand it the raw
    # response body rather than page.doc (Anemone's own pre-parsed document).
    html = page.body
    # Nothing to parse when the fetch failed or returned an empty response.
    next if html.nil? || html.empty?

    doc = Hpricot(html)

    # The fields below are only extracted here for illustration; persist or
    # print them as needed. Each `at` returns nil when the element is missing.
    price      = doc.at("//strong[@id='offerPrice']")
    location   = doc.at("//dl[@class='stripeMe'] > dd")
    desc       = doc.at("//div[@id='offerDesc'] > p")
    offer_no   = doc.at("//div[@id='offerFoot'] p[@class='toLeft']/span/strong")
    created_at = doc.at("//div[@id='offerFoot'] p[@class='toRight']/span/strong")
    photos     = doc.search("//div[@id='imageList']/p/a")
  end
end
require 'sanitize'
require 'anemone'
require 'open-uri'
require 'hpricot'
#otodom.pl
# Crawl otodom.pl starting from a saved search-results query and extract the
# main fields of every offer-details page with Hpricot. Anemone is told to
# keep its crawl state in the "crawl1.pstore" PStore file.

# URL patterns the crawl is restricted to: paginated search results and
# individual offer pages (e.g. "...-id12345.html").
search_results_url = /mod=search&act=searchResults&qid=/
offer_details_url  = /[a-zA-Z]+-id[0-9]*\.html$/

Anemone.crawl("http://otodom.pl/index.php?mod=search&act=searchResults&qid=46911208",
              {:storage => Anemone::Storage.PStore("crawl1.pstore")}) do |anemone|
  # filter out useless pages: drop every link that is neither a search-results
  # page nor an offer-details page, so only those two kinds are followed
  anemone.focus_crawl do |page|
    page.links.delete_if do |link|
      url = link.to_s
      url !~ search_results_url && url !~ offer_details_url
    end
  end

  # process details pages
  anemone.on_pages_like(offer_details_url) do |page|
    # Hpricot builds its document from a String or IO, so hand it the raw
    # response body rather than page.doc (Anemone's own pre-parsed document).
    html = page.body
    # Nothing to parse when the fetch failed or returned an empty response.
    next if html.nil? || html.empty?

    doc = Hpricot(html)

    # The fields below are only extracted here for illustration; persist or
    # print them as needed. Each `at` returns nil when the element is missing.
    price      = doc.at("//strong[@id='offerPrice']")
    location   = doc.at("//dl[@class='stripeMe'] > dd")
    desc       = doc.at("//div[@id='offerDesc'] > p")
    offer_no   = doc.at("//div[@id='offerFoot'] p[@class='toLeft']/span/strong")
    created_at = doc.at("//div[@id='offerFoot'] p[@class='toRight']/span/strong")
    photos     = doc.search("//div[@id='imageList']/p/a")
  end
end
There is 1 comment in this article: