這是你所需要的東西的一個非常粗略的要點:
# Recursively crawls a site starting from a host URL, collecting each
# internal page as a Page object in `pages` and skipping URLs that have
# already been fetched.
#
# Requires `require "open-uri"` and `require "nokogiri"` at file load.
class Parser
  # Array of Page objects collected so far.
  attr_accessor :pages

  def initialize
    # Fixed: @pages was never initialized, so `pages.any?` raised
    # NoMethodError on nil during the very first fetch.
    @pages = []
  end

  # Crawls every internal page reachable from `host`.
  #
  # host - base URL with no trailing slash, e.g. "http://example.com".
  #
  # Returns the array of fetched Page objects.
  def fetch_all(host)
    @host = host
    fetch(@host)
    pages
  end

  private

  # Fetches `url` unless it was already crawled, then parses the document.
  def fetch(url)
    # Dedup guard: pages are pushed before recursing, so cycles terminate.
    return if pages.any? { |page| page.url == url }
    # URI.open (from open-uri) replaces the deprecated Kernel#open-on-a-URL.
    parse_page(url, Nokogiri::HTML(URI.open(url).read))
  end

  # Stores a Page for `url` and recurses into each of its internal links.
  # Fixed: the original referenced an undefined local `url` here; the URL
  # is now passed in explicitly from `fetch`.
  def parse_page(url, document)
    links = extract_links(document)
    # Pages without a <title> yield nil rather than crashing.
    title_node = document.at_css('title')
    pages << Page.new(
      url: url,
      title: title_node ? title_node.text : nil,
      # Fixed: Page#initialize expects :html_content, not :content.
      html_content: document.to_html,
      links: links
    )
    links.each { |link| fetch(@host + link) }
  end

  # Returns the unique internal hrefs of `document` in "/path" form.
  # Absolute links to @host are relativized; external links are dropped.
  def extract_links(document)
    document.css('a').map do |link|
      # to_s guards against <a> tags that have no href attribute.
      href = link['href'].to_s.gsub(@host, '')
      href if href.start_with?('/')
    end.compact.uniq
  end
end
# Value object describing one crawled page: its URL, title, raw HTML,
# and the internal links found on it.
class Page
  attr_accessor :url, :title, :html_content, :links

  # All four attributes are required keyword arguments.
  def initialize(url:, title:, html_content:, links:)
    @url, @title = url, title
    @html_content, @links = html_content, links
  end
end
你需要的代碼相當複雜,你目前寫出的大概只相當於其中的 1%。基本上,你需要在抓取每個頁面時遍歷頁面上的所有鏈接,過濾掉外部鏈接,並保存一個已抓取頁面的數組,以避免重複請求。 –
您應該搜索 Stack Overflow,這類問題已經被討論過很多次了。這裏有一個參考:http://stackoverflow.com/a/4981595/128421 –