from hmc_urllib import getHTML 

# Depth passed to getRecursiveURLs by getURL; each recursive call receives
# DEPTH - 1 and links are only followed while the value is above zero.
DEPTH = 2 

# Module-level list shared by the crawler functions: getRecursiveURLs tests
# membership in it and returns it, and makeWords iterates over it.
all_links = [] 

def getURL():
    """Prompt for a starting URL and process it.

    Returns:
        A 2-tuple: the result of makeListOfWords for the entered URL,
        followed by the result of getRecursiveURLs for that URL at DEPTH.
    """
    start = input('Please enter a URL: ')
    first_page_words = makeListOfWords(start)
    crawled = getRecursiveURLs(start, DEPTH)
    return first_page_words, crawled

def getRecursiveURLs(url, DEPTH):
    """Follow the links of url up to DEPTH levels, recording each one once.

    Every link that is not yet in the module-level all_links list is
    appended to it and then crawled with DEPTH - 1. Links already in
    all_links are skipped, so a page is never fetched twice by this
    function and link cycles terminate.

    Args:
        url: address of the page whose links are followed.
        DEPTH: how many more levels of links may be followed; at zero or
            below nothing is fetched.

    Returns:
        The shared all_links list (the same object on every call; it is
        not cleared here).
    """
    if DEPTH <= 0:
        return all_links

    # getHTML's result is indexed as [0] for page text and [1] for links
    # elsewhere in this file; only the links are needed here.
    links = getHTML(url)[1]
    for link in links:
        if link in all_links:
            continue
        # Record the link *before* descending so that pages linking back
        # to it further down the recursion see it as already handled.
        all_links.append(link)
        getRecursiveURLs(link, DEPTH - 1)

    # Word counting is deliberately not triggered from inside the crawl:
    # makeWords walks the whole of all_links, so calling it per link would
    # re-fetch every collected page once for each new link.
    return all_links

def makeWords(i):
    """Build one combined word-frequency dictionary for all pages in all_links.

    Each page is processed with makeListOfWords and the per-page counts are
    summed word by word, so a word that appears on several pages gets a
    single entry holding its total.

    Args:
        i: unused; kept so existing callers that pass a link keep working.
            The pages are always taken from the module-level all_links list.

    Returns:
        dict mapping word -> summed frequency; empty when all_links is empty.
    """
    FinalDict = {}
    for link in all_links:
        # makeListOfWords returns the dictionary produced by counter(), which
        # holds at most MAXWORDS entries per page, so the totals only cover
        # the words each page reported.
        for word, count in makeListOfWords(link).items():
            FinalDict[word] = FinalDict.get(word, 0) + count
    return FinalDict

def makeListOfWords(URL):
    """Fetch the page at URL, split its text on whitespace and pass the
    resulting word list to cleaner, returning whatever cleaner returns."""
    page = getHTML(str(URL))
    raw_words = page[0].split()
    return cleaner(raw_words)

def cleaner(L):
    """Normalize a list of raw words and count them.

    Each word is stripped of non-letter characters with dePunc, dropped if
    it is a stop word, stemmed with stemmer, and the remaining stems are
    passed to counter.

    Args:
        L: list of raw word strings (e.g. the result of str.split()).

    Returns:
        Whatever counter returns for the cleaned, stemmed words.
    """
    # '' is included so that tokens consisting only of digits/punctuation,
    # which dePunc reduces to an empty string, are discarded as well.
    stop_words = {'', 'a', 'i', 'the', 'and', 'an', 'in', 'with', 'for',
                  'it', 'am', 'at', 'on', 'of', 'to', 'is', 'so', 'too',
                  'my', 'but', 'are', 'very', 'here', 'even', 'from',
                  'them', 'then', 'than', 'this', 'that', 'though'}

    stripped = [dePunc(raw) for raw in L]

    # dePunc keeps the original capitalization while the stop list is all
    # lower case, so compare case-insensitively ('The', 'I', ... are removed
    # too). The words that are kept retain their original case.
    kept = [word for word in stripped if word.lower() not in stop_words]

    stems = [stemmer(word) for word in kept]

    return counter(stems)

def dePunc(rawword):
    """Return rawword with every character other than A-Z / a-z removed."""

    def is_ascii_letter(ch):
        return 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'

    return ''.join(filter(is_ascii_letter, rawword))

def stemmer(word):
    """Strip a common English suffix from word using a simple rule set.

    Three-letter suffixes are tried before two-letter ones. When the stem
    left after removing the suffix ends in a doubled letter, one of the two
    is dropped as well (spammers -> spam, spammed -> spam); otherwise only
    the suffix is removed (players -> play, played -> play).

    Args:
        word: the word to stem.

    Returns:
        The stemmed word, or word unchanged when no rule applies. Words too
        short to leave a stem of at least two letters are returned as-is.
    """
    # List of endings. NOTE(review): 's' is listed, but only two- and
    # three-letter suffixes are examined below, so a bare plural 's' is not
    # stripped -- confirm whether that is intended.
    endings = ['ed', 'es', 's', 'ly', 'ing', 'er', 'ers']

    for size in (3, 2):
        # Two letters of stem are needed to test for a doubled consonant;
        # shorter words would otherwise index past the start of the string.
        if len(word) < size + 2:
            continue
        if word[-size:] not in endings:
            continue

        stem = word[:-size]
        if stem[-1] == stem[-2]:
            # Doubled consonant before the suffix: drop the extra letter.
            return stem[:-1]
        return stem

    # If word not inflected, return as-is.
    return word

def counter(List, max_words=None):
    """Count word frequencies, then print and return the most frequent ones.

    Args:
        List: iterable of hashable words to count.
        max_words: maximum number of distinct words to report. When omitted
            (None), the module-level MAXWORDS value is used.

    Returns:
        dict mapping each reported word to its frequency, inserted from most
        to least frequent; words with equal counts keep first-seen order.
    """
    # NOTE(review): MAXWORDS is not defined in the visible part of this file;
    # confirm it is defined elsewhere, or pass max_words explicitly.
    limit = MAXWORDS if max_words is None else max_words

    # Assign frequency to each word
    freq = {}
    for item in List:
        freq[item] = freq.get(item, 0) + 1

    # Walk the words from most to least frequent and stop as soon as the
    # cap is reached instead of scanning the remaining words.
    result = {}
    for word in sorted(freq, key=freq.get, reverse=True):
        if len(result) >= limit:
            break
        print(word, '(', freq[word], ')', sep='')
        result[word] = freq[word]
    return result

有很多關於如何抓取網站的教程,例如:http://ms4py.org/2010/4/10/python-search-engine-crawler-part-1/ 。 – schlamar


你在實作上有哪些限制?我建議使用隊列和線程來抓取,而不是使用遞歸。 –


另外,`getRecursiveURLs()` 究竟有什麼問題? –



作業的確切要求並不完全清楚,但就我所能理解的,你是想把指定深度以內的所有頁面各訪問一次、且僅訪問一次。另外,你希望取出所有頁面中的所有單詞,並對彙總後的結果進行處理。下面的代碼片段應該就是你要找的,但它沒有經過測試(我沒有 hmc_urllib)。`all_links`、`makeWords` 和 `makeListOfWords` 已被刪除,其餘代碼保持不變。

# Module-level list consulted by getRecursiveURLs: a link found in this list
# is not followed.
visited_links = [] 

def getURL():
    """Prompt for a URL, gather the words reachable from it with
    getRecursiveURLs (limited by DEPTH) and return cleaner's result for
    the combined word list."""
    start_url = input('Please enter a URL: ')
    # Per the original note, cleaner prints the word count for all pages.
    return cleaner(getRecursiveURLs(start_url, DEPTH))

def getRecursiveURLs(url, DEPTH):
    """Collect the words of url and of the pages reachable from it.

    The page is recorded in the module-level visited_links list, its text is
    split into words, and every link not yet in visited_links is crawled
    with DEPTH - 1. Each page therefore contributes its words once.

    Args:
        url: address of the page to fetch.
        DEPTH: how many more levels of links may be followed from this page.

    Returns:
        list of whitespace-separated words from this page followed by the
        words of the newly visited linked pages.
    """
    # Mark the page as visited before following its links so that a link
    # cycle (or a second link to the same page) does not fetch it again.
    visited_links.append(url)

    text, links = getHTML(url)
    returned_word_list = text.split()

    if DEPTH > 0:
        for link in links:
            if link not in visited_links:
                returned_word_list += getRecursiveURLs(link, DEPTH - 1)
    return returned_word_list


def counter(words):
    """Return a dictionary mapping each distinct word to its occurrence count.

    Example input:  ['spam', 'egg', 'egg', 'egg', 'spam', 'spam', 'egg', 'egg']
    Example output: {'spam': 3, 'egg': 5}

    Args:
        words: iterable of hashable words.

    Returns:
        dict of word -> number of times it appears in words.
    """
    # Imported locally so this function does not depend on the file's
    # top-level imports. Counter tallies in a single pass instead of calling
    # list.count once per distinct word.
    from collections import Counter

    return dict(Counter(words))

def print_count(word_count, word_max):
    """Print the most frequent words, one per line, formatted as word(count).

    Example input: {'spam': 3, 'egg': 5}

    Args:
        word_count: dict mapping word -> frequency.
        word_max: number of entries to print; the list of words sorted by
            descending frequency is sliced to this length.
    """
    ranked = sorted(word_count, key=word_count.get, reverse=True)
    for word in ranked[:word_max]:
        print(word, '(', word_count[word], ')', sep='')

感謝您的回覆!您給我的代碼能正確打印每一頁中每個單詞及其頻率,但創建出的字典只包含單一頁面的文本。我需要一個最終的字典,以所有頁面中的單詞作爲鍵、頻率作爲值,這樣我才能按需要的方式對它們排序。現在它返回的結果類似:spam(8) page(1) love(1),這是第一頁;下一頁是:dry(4) page(2) these(2),依此類推。最終結果必須是:spam(8) dry(4) page(3) these(2) love(1)。 –


我不知道爲什麼它對你不起作用。當執行到 `return cleaner(global_word_list)` 時,`global_word_list` 應該已經包含所有頁面中的所有單詞。我已經把你的代碼讀了好幾遍,找不到明確的原因可以解釋你所描述的行爲。你是否修改過 `cleaner` 或 `counter`? 另外,你不應該用 `List` 作爲 `counter` 的參數名稱:它與 Python 內建的 `list` 只差大小寫(`list` 是內建型別名稱,並非關鍵字),容易造成混淆;若不小心遮蔽了內建名稱,可能會導致意外的行爲。 –


抱歉,之前的留言造成了混亂。我找到問題了。首先,`global_word_list += text.split()` 這一行會報錯,提示局部變量在賦值之前被引用,所以我把它改成了 `global_word_list.append(text.split())`。但這樣做的問題在於 `split()` 產生的是一個列表,而 `append` 會把整個列表當作單一元素加入。因此在創建字典時,它看到的是一個個列表,也就是每個頁面的全部文本。我需要弄清楚如何把它們合併成一個扁平的列表。 –