我可以使用什麼方法來檢測機器人？

僅僅因爲軟件是自動化的並不意味着它會遵守你的robots.txt。當某人抓取或DDOSing您的網站時，有哪些方法可以檢測到？假設您的網站有100個頁面，並且值得抓取或DDOSing。我可以使用什麼方法來檢測機器人？

我有一個可能行不通的笨想法：給每個用戶一個帶有唯一值的cookie，當有人發出第二/第三/等後續請求時，就可以透過這個cookie識別出來。這可能不起作用，因爲抓取工具可能不接受cookie，因此在這種方案中，機器人的每個請求看起來都像是來自一個新用戶。

有沒有人有更好的想法？

來源

2011-07-22 dan

你可以在網頁中放置對最終用戶不可見、也無法點擊的鏈接。許多機器人只是遵循所有鏈接。一旦有人請求其中一個鏈接，你幾乎可以肯定遇到了爬蟲/機器人。

來源

2011-07-22 04:34:26 BrokenGlass

在我看來（IMO），這是最佳做法。 +1。 – hoodaticus

這很聰明。在識別出來後，如何識別來自同一機器人的請求？ IP地址和標題信息？我想沒有其他事情可以做了.. – dan

雖然在技術上，它不會「檢測」機器人爬蟲，我有一個有趣的方式來阻止他們。我的方法是創建一個IIS過濾器或Apache插件。你會做的是加密所有的HTML，ASP，PHP等...頁面。唯一沒有加密的頁面就是索引頁面。索引頁面只需安裝帶有加密公鑰的cookie，然後重定向到第二個索引頁面。 IIS篩選器或Apache插件會檢查每個訪問者以確保他們擁有這個cookie。如果有，過濾器會解密請求的頁面，然後將頁面傳遞到Web服務器進行處理。

這種方法將允許正常訪問者查看您的網頁，但如果是拒絕cookie的機器人試圖讀取網頁，它得到的內容都會是加密的。

來源

2011-07-22 04:28:21 Icemanind

如果你的用戶或他們的公司政策拒絕cookie，怎麼辦？ – griegs

如果有人能夠鏈接到網站中的其中一個網頁並直接進入網站而不通過索引？如果他們在網站內部收藏一個頁面，並且在cookie未到期時返回該頁面而不通過索引？ – EdoDodo

@griegs - 那麼公司或用戶只會獲得加密數據。 – Icemanind

Project Honey Pot（蜜罐項目）維護着一份「壞」機器人的列表。

這是我寫的一個用來調用他們網絡服務的類。你必須修改它，因爲我用了一些專有的庫，但大部分應該可以直接使用。有時他們的服務會返回錯誤，但它確實有助於減少一些惡意流量。

using System; 
using System.Linq; 
using System.Net; 
using System.Xml.Linq; 
using SeaRisenLib2.Text; 
using XmlLib; 

/// <summary>
/// Looks up visitor IP addresses in the Project Honey Pot http:BL DNS blacklist
/// and converts the answer to and from XML for logging.
/// </summary>
public class HoneyPot
{
    // http:BL access key. This is a placeholder: register at httpbl.org to get a real one,
    // and prefer loading it from configuration rather than committing it to source.
    private const string KEY = "blacklistkey";

    // DNS zone that answers blacklist queries.
    private const string HTTPBL = "dnsbl.httpbl.org";

    public HoneyPot()
    {
    }

    /// <summary>
    /// Queries http:BL for <paramref name="ip"/> and decodes the answer.
    /// </summary>
    /// <param name="ip">Visitor address in textual form. Only IPv4 addresses are looked up.</param>
    /// <returns>
    /// The decoded <see cref="Score"/>, or <c>null</c> when the address is a loopback
    /// address, is not IPv4, is not listed in the blacklist, or the lookup failed.
    /// Failures are written to the "VisitorLog/HoneyPotErrors" log; this method does not throw.
    /// </returns>
    public static Score GetScore_ByIP(string ip)
    {
        string sendMsg = "", receiveMsg = "";
        int errorCount = 0; // number of steps completed; logged so a failure can be located
        try
        {
            IPAddress address;
            if (!IPAddress.TryParse(ip, out address))
            {
                throw new FormatException("Invalid IP address to HoneyPot.GetScore_ByIP:" + ip);
            }

            // Development machine: covers 127.0.0.0/8 and ::1, not just the literal "127.0.0.1".
            if (IPAddress.IsLoopback(address))
            {
                return null;
            }

            // The query name is built from four reversed octets, so anything that is
            // not IPv4 cannot be looked up. That is not an error, there is simply no answer.
            if (address.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
            {
                return null;
            }
            errorCount++; // 1

            // Reverse the parsed octets rather than splitting the raw text, so shorthand
            // forms accepted by TryParse (e.g. "1.2") still yield a well-formed query.
            string reverseIP = string.Join(".",
                address.GetAddressBytes()
                       .Reverse()
                       .Select(b => b.ToString(System.Globalization.CultureInfo.InvariantCulture))
                       .ToArray());
            sendMsg = string.Format("{0}.{1}.{2}", KEY, reverseIP, HTTPBL);
            errorCount++; // 2

            IPHostEntry value;
            try
            {
                value = Dns.GetHostEntry(sendMsg);
            }
            catch (System.Net.Sockets.SocketException dnsEx)
            {
                // "Name does not exist" is how the blacklist reports an address it has no
                // record of. That is the normal case for ordinary visitors, so do not log it.
                if (dnsEx.SocketErrorCode == System.Net.Sockets.SocketError.HostNotFound ||
                    dnsEx.SocketErrorCode == System.Net.Sockets.SocketError.NoData)
                {
                    return null;
                }
                throw; // real resolver failure: let the outer handler log it
            }
            errorCount++; // 3

            address = value.AddressList[0];
            errorCount++; // 4
            receiveMsg = address.ToString();
            errorCount++; // 5
            byte[] octets = address.GetAddressBytes();
            errorCount++; // 6

            // A valid answer is an IPv4 address whose first octet is 127.
            if (octets.Length != 4 || 127 != octets[0])
            {
                throw new InvalidOperationException("HoneyPot error");
            }
            errorCount++; // 7

            Score score = new Score()
            {
                DaysSinceLastSeen = octets[1],
                Threat = octets[2],
                BotType = octets[3]
            };
            errorCount++; // 8
            return score;
        }
        catch (Exception ex)
        {
            // Best effort: a failed lookup must never break the page request.
            Log.Using("VisitorLog/HoneyPotErrors", log =>
            {
                log.SetString("IPrequest", ip);
                log.SetString("SendMsg", sendMsg, XmlFile.ELEMENT);
                log.SetString("RecvMsg", receiveMsg, XmlFile.ELEMENT);
                log.SetString("Exception", ex.Message, XmlFile.ELEMENT);
                log.SetString("ErrorCount", errorCount.ToString());
            });
        }
        return null;
    }

    /// <summary>
    /// Bitwise visitor-type values carried in <see cref="Score.BotType"/>.
    /// A value of 0 means search engine; the other values may be combined.
    /// </summary>
    public enum BotTypeEnum : int
    {
        SearchEngine = 0,
        Suspicious = 1,
        Harvester = 2,
        CommentSpammer = 4
    }

    /// <summary>
    /// Decoded blacklist answer. Every field is -1 until it is populated.
    /// </summary>
    public class Score
    {
        public Score()
        {
            BotType = -1;
            DaysSinceLastSeen = -1;
            Threat = -1;
        }

        /// <summary>Second octet of the blacklist answer.</summary>
        public int DaysSinceLastSeen { get; internal set; }

        /// <summary>Third octet of the blacklist answer.</summary>
        public int Threat { get; internal set; }

        /// <summary>
        /// Fourth octet of the blacklist answer. Use BotTypeEnum to understand value.
        /// </summary>
        public int BotType { get; internal set; }

        /// <summary>
        /// Convert HoneyPot Score values to String (DaysSinceLastSeen.Threat.BotType)
        /// </summary>
        /// <returns>The three values joined with dots.</returns>
        public override string ToString()
        {
            return string.Format("{0}.{1}.{2}",
                DaysSinceLastSeen,
                Threat,
                BotType);
        }

        /// <summary>
        /// Builds a "HoneyPot" element from a score. A null score yields an empty element;
        /// unset (negative) fields are omitted.
        /// </summary>
        public static explicit operator XElement(Score score)
        {
            XElement xpot = new XElement("HoneyPot");
            if (null != score)
            {
                if (score.DaysSinceLastSeen >= 0)
                {
                    xpot.SetString("Days", score.DaysSinceLastSeen);
                }
                if (score.Threat >= 0)
                {
                    xpot.SetString("Threat", score.Threat);
                }
                if (score.BotType >= 0)
                {
                    xpot.SetString("Type", score.BotType);

                    // Only decode categories when BotType is set: the unset marker (-1)
                    // has every bit on and would otherwise match every non-zero category.
                    foreach (BotTypeEnum t in Enum.GetValues(typeof(BotTypeEnum)))
                    {
                        // Log enum values as string for each bitwise value represented in score.BotType
                        int value = (int)t;
                        if ((value == score.BotType) || ((value & score.BotType) > 0))
                        {
                            xpot.GetCategory(t.ToString());
                        }
                    }
                }
            }
            return xpot;
        }

        /// <summary>
        /// Reads a score back from a "HoneyPot" element; returns null for a null element.
        /// </summary>
        public static explicit operator Score(XElement xpot)
        {
            Score score = null;
            if (null != xpot)
            {
                score = new Score()
                {
                    DaysSinceLastSeen = xpot.GetInt("Days"),
                    Threat = xpot.GetInt("Threat"),
                    BotType = xpot.GetInt("Type")
                };
            }
            return score;
        }
    }

    /// <summary>
    /// Log score value to HoneyPot child Element (if score not null).
    /// </summary>
    /// <param name="score">Score to log; nothing is added when null.</param>
    /// <param name="parent">Element that receives the child; nothing is added when null.</param>
    public static void LogScore(HoneyPot.Score score, XElement parent)
    {
        if ((null != score) && (null != parent))
        {
            parent.Add((XElement)score);
        }
    }

}

來源

2011-07-22 04:30:18

黑名單可能不是一個很好的做法，更好的方式是建立一份已知機器人的白名單，允許它們每秒的點擊次數超過某個限額。如果某個不在白名單上的訪問者每秒的點擊次數過多，就開始丟棄他們的連接幾秒鐘。這將有助於防止DDoS，同時仍然讓未知的機器人掃描您的網站（儘管速度會比它們預期的慢很多）。

您可以保留違規者的日誌，看看誰在反覆違規 :)

來源

2011-07-22 04:39:33

我可以使用什麼方法來檢測機器人？

回答

相關問題