2014-05-20 61 views
0

我現在在 models/ 文件夾中有這個文件(主題:把 Rails 任務轉換爲 rake 任務):

# ActiveRecord model for a TV show whose listing page (+url+) is scraped to
# find the date/time of its next screening (+next_screening+).
class Show < ActiveRecord::Base
  require 'nokogiri'
  require 'open-uri'

  # Calendar date as printed on the page, e.g. "5/20/2014".
  DATE_PATTERN = %r{\d{1,2}/\d{1,2}/\d{4}}
  # Clock time printed when the show airs today, e.g. "8PM" or "10PM".
  # One or two hour digits: a single \d would turn "10PM" into "0PM".
  TIME_PATTERN = /\d{1,2}(?:AM|PM)/
  # Errors that mean "this one page could not be fetched".
  FETCH_ERRORS = [OpenURI::HTTPError, SocketError, SystemCallError, IOError,
                  Timeout::Error, URI::InvalidURIError].freeze

  has_many :user_shows
  has_many :users, through: :user_shows

  # Refreshes +next_screening+ for every show.
  #
  # Records are loaded in batches (find_each) rather than all at once, and a
  # show whose page cannot be fetched is skipped (its stored value is left
  # untouched) instead of aborting the whole run.
  def self.update_all_screenings
    Show.find_each do |show|
      begin
        screening = Show.update_next_screening(show.url)
      rescue *FETCH_ERRORS => e
        logger.warn("Show##{show.id}: could not fetch #{show.url}: #{e.message}") if logger
        next
      end
      show.update_attribute(:next_screening, screening)
    end
  end

  # Downloads the page at +url+ and extracts the next screening from it.
  #
  # @param url [String] http(s) address of the show's episode page
  # @return [DateTime, nil] the next screening, or nil when the page does not
  #   contain a recognisable date or time
  # @raise [OpenURI::HTTPError, SocketError, ...] when the page cannot be fetched
  def self.update_next_screening(url)
    # URI#open (rather than Kernel#open) only ever performs a network request,
    # works on Ruby 3+, and the block form closes the stream when done.
    page = URI.parse(url).open { |io| Nokogiri::HTML(io) }
    parse_next_screening(page)
  end

  # Extracts the next screening from a parsed listing page.
  #
  # The page shows a full date for future episodes; when the episode airs
  # today it shows only a time, so today's date is combined with that time.
  #
  # @param page [Nokogiri::HTML::Document]
  # @return [DateTime, nil]
  def self.parse_next_screening(page)
    node = page.at_css('.next_episode .highlight_date')
    return nil unless node

    text = node.text
    if (date = text[DATE_PATTERN])
      DateTime.strptime(date, '%m/%d/%Y')
    elsif (time = text[TIME_PATTERN])
      today = DateTime.now.strftime('%D')
      DateTime.strptime("#{today} #{time}", '%m/%d/%y %l%p')
    end
  rescue ArgumentError
    # The text matched the pattern but is not a real date/time (e.g. "13/45/2014").
    nil
  end
  private_class_method :parse_next_screening
end

然而,當我運行

Show.update_all_screenings 

它要花非常長的時間才能跑完。我目前有一個非常相似的腳本,它是一個 rake 文件,抓取量是這個的兩倍,卻能在10分鐘內完成,而這個過程需要8個小時。所以我想知道如何將這個文件轉換爲 rake 任務?我正在構建的整個應用程序都取決於它能否在最多1小時內完成。

下面是其他腳本供參考:

require 'mechanize' 

namespace :show do
  desc "add tv shows from web into database"
  # Rebuilds the shows table from the tv.com A-Z listing: every letter page,
  # and every following pagination page, contributes one Show per link.
  task :scrape => :environment do
    puts 'scraping...'

    # The table is emptied first; all rows are re-created below.
    Show.delete_all

    agent = Mechanize.new

    # Creates one Show for each show link on the page the agent currently holds.
    store_shows = lambda do
      agent.page.search('//li[@class="show"]/a').each do |show_link|
        Show.create(title: show_link.text,
                    url: 'http://tv.com' + show_link[:href].to_s + 'episodes/')
      end
    end

    agent.get 'http://www.tv.com/shows/sort/a_z/'
    # The letter links are collected from the index page before navigating away.
    letter_links = agent.page.search('//div[@class="alphabet"]//li[not(contains(@class, "selected"))]/a')

    letter_links.each do |letter_link|
      agent.get letter_link[:href]
      store_shows.call

      # Follow "next" links until the last page for this letter is reached.
      while (next_page_link = agent.page.at('//div[@class="_pagination"]//a[@class="next"]'))
        agent.get next_page_link[:href]
        store_shows.call
      end
    end
  end
end
+0

嘗試delayed_jobs https://github.com/collectiveidea/delayed_job。它很容易實現,你的方法將在後臺運行 –

+0

嗨 @SachinPrasad,這樣會加快速度,還是只是把這項工作推遲到後臺執行? – HarryLucas

+0

它只是在後臺運行該方法。延遲作業將使該方法異步,因此您不必等待該方法完成。 –

回答

2

Rake 不是靈丹妙藥 - 它不會讓任何代碼運行得更快。

你可以做的是讓代碼更有效率地運行。代碼中最耗時的部分是在循環中逐個調用 open(url)。如果您可以同時(並發地)讀取所有網址,整個過程所需的時間應該只是現在的一小部分。

您可以使用 typhoeus gem(或其他 gem)來爲您處理這件事。

- 注意!以下是未經測試的代碼 -

我沒有使用這個 gem 的經驗,但是你的代碼可能會是這個樣子:

require 'nokogiri' 
require 'open-uri' 
require 'typhoeus' 

# ActiveRecord model for a TV show whose listing page (+url+) is scraped to
# find the date/time of its next screening (+next_screening+). Pages are
# downloaded concurrently through a Typhoeus hydra.
class Show < ActiveRecord::Base
  # Calendar date as printed on the page, e.g. "5/20/2014".
  DATE_PATTERN = %r{\d{1,2}/\d{1,2}/\d{4}}
  # Clock time printed when the show airs today, e.g. "8PM" or "10PM".
  # One or two hour digits: a single \d would turn "10PM" into "0PM".
  TIME_PATTERN = /\d{1,2}(?:AM|PM)/

  has_many :user_shows
  has_many :users, through: :user_shows

  # Refreshes +next_screening+ for every show.
  #
  # One request per show is queued on the hydra and all of them are run
  # together; each show is updated from its own response as it completes.
  # A response that did not succeed (HTTP error, timeout, ...) is skipped so
  # the stored value is not overwritten with nil.
  def self.update_all_screenings
    hydra = Typhoeus::Hydra.hydra
    Show.all.each do |show|
      # followlocation: the listing URLs answer with a 301 redirect.
      request = Typhoeus::Request.new(show.url, followlocation: true)
      request.on_complete do |response|
        if response.success?
          show.update_attribute(:next_screening, Show.update_next_screening(response.body))
        elsif logger
          logger.warn("Show##{show.id}: request for #{show.url} failed (HTTP #{response.code})")
        end
      end
      hydra.queue(request)
    end
    hydra.run
  end

  # Extracts the next screening from the HTML of a listing page.
  #
  # The page shows a full date for future episodes; when the episode airs
  # today it shows only a time, so today's date is combined with that time.
  #
  # @param body [String] HTML source of the show's episode page
  # @return [DateTime, nil] the next screening, or nil when the page does not
  #   contain a recognisable date or time
  def self.update_next_screening(body)
    node = Nokogiri::HTML(body).at_css('.next_episode .highlight_date')
    return nil unless node

    text = node.text
    if (date = text[DATE_PATTERN])
      DateTime.strptime(date, '%m/%d/%Y')
    elsif (time = text[TIME_PATTERN])
      today = DateTime.now.strftime('%D')
      DateTime.strptime("#{today} #{time}", '%m/%d/%y %l%p')
    end
  rescue ArgumentError
    # The text matched the pattern but is not a real date/time (e.g. "13/45/2014").
    nil
  end
end

上面的代碼應該會把所有請求收集到一個隊列中,同時(並發地)運行它們,並在每個響應返回時對其進行處理。

+0

感謝您花時間回答!我試了一下,但似乎每一行都返回類似這樣的輸出:'performed EASY url= response_code=301'(即 HTTP 301 重定向) – HarryLucas

+0

@HarryLucas - 請嘗試 `request = Typhoeus::Request.new(show.url, followlocation: true)`(我已更新帖子 - 這樣應該會跟隨重定向) –