2014-07-15 80 views
2

無法通過ID在HTML頁面中找到元素(一個簡單的例子)。下面是一個解析HTML的簡單例子:

import qualified Data.Text as T 
import Text.HTML.DOM (parseLBS) 
import Text.XML.Cursor (Cursor, attributeIs, content, element, fromDocument, child, ($//), (&|), (&//), (>=>), following) 
import qualified Data.String (fromString) 

-- | Axis that matches a @div@ element whose @id@ attribute equals
-- @large-user-info@.
--
-- The string literals are converted explicitly with
-- 'Data.String.fromString' rather than relying on @OverloadedStrings@.
findNodes :: Cursor -> [Cursor]
findNodes = element divTag >=> attributeIs idAttr wantedId
  where
    divTag   = Data.String.fromString "div"
    idAttr   = Data.String.fromString "id"
    wantedId = Data.String.fromString "large-user-info"
    -- Disabled continuation of the axis chain, kept for reference:
    -- >=>
    -- following >=> element (toXName "div") >=> (toXName "class") `attributeIs` (toT "reputation") >=>
    -- child >=> child

-- | Download the hard-coded Stack Overflow profile page, parse it with
-- 'parseLBS' and turn the resulting document into a 'Cursor'.
cursorFor :: IO Cursor
cursorFor =
    fmap (fromDocument . parseLBS)
         (simpleHttp "http://stackoverflow.com/users/2813589/alexander-supertramp")

-- | Fetch the page and print whatever 'findNodes' yields when applied
-- directly to the cursor returned by 'cursorFor'.
main :: IO ()
main = cursorFor >>= print . findNodes

儘管`page`確實包含整個頁面(我檢查過了),`findNodes`仍然返回一個空列表——它總是打印`[]`。我做錯了什麼?

回答

1

我稍微重寫了一下你的代碼。你之前什麼也沒得到,是因爲系統不知道該如何把數據展示給你;這個div裏有很多內部元素。關鍵的改動是用`cursor $// findNodes`在所有後代節點中查找,而不是把`findNodes`直接應用在根游標上。

{-# LANGUAGE OverloadedStrings #-} 

module ALSU where 

import Network.HTTP.Conduit (simpleHttp) 
import Text.HTML.DOM 
import Text.XML.Cursor (Cursor, attributeIs, attribute, node, content, element, fromDocument, fromNode, child, 
         ($//), (&|), (&//), (>=>)) 
import qualified Data.Text as T 
import qualified Data.String (fromString) 

---------------------------------------------------------------------------- 

-- | Address of the Stack Overflow profile page that 'main' downloads.
--
-- The explicit signature pins the literal to 'String'; without it the type
-- of this overloaded literal is only determined by how it is used.
url :: String
url = "http://stackoverflow.com/users/2813589/alexander-supertramp"

-- | Axis that matches a @div@ element whose @id@ attribute is
-- @large-user-info@.
findNodes :: Cursor -> [Cursor]
findNodes cursor = element "div" cursor >>= attributeIs "id" "large-user-info"

-- | Concatenate the text pieces that 'content' returns for the given cursor.
--
-- NOTE(review): for the element cursors selected in 'main' this appears to
-- produce an empty 'T.Text' (the sample run prints @[""]@); collect text from
-- descendant nodes here if the inner text of the div is wanted.
extractData :: Cursor -> T.Text
extractData = T.concat . content

-- | Download the page at the given URL, parse it with 'parseLBS' and turn
-- the resulting document into a 'Cursor'.
cursorFor :: String -> IO Cursor
cursorFor u = fmap (fromDocument . parseLBS) (simpleHttp u)

-- | Fetch the profile page, search every descendant of the root cursor with
-- 'findNodes', and print the matching cursors immediately followed by the
-- text 'extractData' produces for each of them, then a newline.
main :: IO ()
main = do
    root <- cursorFor url
    let matches = root $// findNodes
        texts   = map extractData matches
    putStr (show matches)
    -- 'print' emits @show texts@ plus the trailing newline in one step.
    print texts

啓用`OverloadedStrings`擴展進行編譯,這樣就不需要寫`Data.String.fromString`了。運行這段代碼,你會看到確實取到了數據;接下來需要根據你想查看的內容,按需重寫`extractData`。

這裏是輸出

[Cursor @ NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header"),(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"large-user-info"),(Name {nameLocalName = "style", nameNamespace = Nothing, namePrefix = Nothing},"")], elementNodes = [NodeContent "\r\n  ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-header-left")], elementNodes = [NodeContent "\r\n   ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"gravatar")], elementNodes = [NodeContent "\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"http://stackoverflow.com/users/2813589/alexander-supertramp")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "img", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "alt", nameNamespace = Nothing, namePrefix = Nothing},""),(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"logo"),(Name {nameLocalName = "height", nameNamespace = Nothing, namePrefix = Nothing},"128"),(Name {nameLocalName = "src", nameNamespace = Nothing, namePrefix = 
Nothing},"https://www.gravatar.com/avatar/0e1f310400630c00abfe892c212bfe18?s=128&d=identicon&r=PG&f=1"),(Name {nameLocalName = "width", nameNamespace = Nothing, namePrefix = Nothing},"128")], elementNodes = []})]}),NodeContent "\r\n    "]}),NodeContent "  \r\n    \r\n\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"reputation")], elementNodes = [NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n       ",NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"https://stackoverflow.com/users/2813589/alexander-supertramp?tab=reputation")], elementNodes = [NodeContent "1,780"]}),NodeContent "\r\n     "]}),NodeContent "\r\n     reputation\r\n    "]}),NodeContent "\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badges")], elementNodes = [NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"1 gold badge")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, 
elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge1")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "1"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"5 silver badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge2")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent "5"]})]}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"22 bronze badges")], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badge3")], elementNodes = []}),NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"badgecount")], elementNodes = [NodeContent 
"22"]})]})]}),NodeContent "     \r\n    "]}),NodeContent "\r\n    \r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "id", nameNamespace = Nothing, namePrefix = Nothing},"change-picture-progress")], elementNodes = []}),NodeContent "\r\n   "]}),NodeContent "\r\n\r\n   ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"data")], elementNodes = [NodeContent "\r\n   ",NodeElement (Element {elementName = Name {nameLocalName = "table", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "bio"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "website"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeElement (Element {elementName = Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = 
Nothing, namePrefix = Nothing},"url"),(Name {nameLocalName = "href", nameNamespace = Nothing, namePrefix = Nothing},"http://www.gildedhonour.com"),(Name {nameLocalName = "rel", nameNamespace = Nothing, namePrefix = Nothing},"nofollow me")], elementNodes = [NodeContent "gildedhonour.com"]})]}),NodeContent "\r\n     "]}),NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "location"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"label adr")], elementNodes = [NodeContent "roaming in SE Asia"]}),NodeContent "\r\n     "]}),NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "age"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], 
elementNodes = []}),NodeContent "\r\n     "]}),NodeContent "\r\n    "]}),NodeContent "\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "visits"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "member for"]}),NodeContent "\r\n\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"cool"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2013-09-25 04:05:54Z")], elementNodes = [NodeContent "9 months"]}),NodeContent "\r\n     "]}),NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = []}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "seen"]}),NodeContent "\r\n\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = 
Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"hot"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "\r\n       ",NodeElement (Element {elementName = Name {nameLocalName = "span", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"relativetime"),(Name {nameLocalName = "title", nameNamespace = Nothing, namePrefix = Nothing},"2014-07-15 13:15:05Z")], elementNodes = [NodeContent "32 mins ago"]}),NodeContent "\r\n      "]}),NodeContent "\r\n     "]}),NodeContent "\r\n    "]}),NodeContent "\r\n    ",NodeElement (Element {elementName = Name {nameLocalName = "tbody", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-profile-stats")], elementNodes = [NodeContent "\r\n     ",NodeElement (Element {elementName = Name {nameLocalName = "tr", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "th", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "stats"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "profile views"]}),NodeContent "\r\n      ",NodeElement (Element {elementName = Name {nameLocalName = "td", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [], elementNodes = [NodeContent "465"]}),NodeContent "\r\n     "]}),NodeContent "\r\n\r\n        "]}),NodeContent "\r\n   "]}),NodeContent "\r\n   "]}),NodeContent 
"\r\n\r\n  "]}),NodeContent "\r\n  \r\n  ",NodeElement (Element {elementName = Name {nameLocalName = "div", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"user-about-me note")], elementNodes = [NodeContent "\r\n  "]}),NodeContent "\r\n  ",NodeElement (Element {elementName = Name {nameLocalName = "br", nameNamespace = Nothing, namePrefix = Nothing}, elementAttributes = fromList [(Name {nameLocalName = "class", nameNamespace = Nothing, namePrefix = Nothing},"clear")], elementNodes = []}),NodeContent "\r\n "]})][""] 
+0

`putStr $ show $ divs`返回`[""]`。 –

+0

另外,我認爲`extractData cursor = T.concat . content $ cursor`應該寫成`extractData = T.concat . content`。 –

+0

@AlexanderSupertramp 正如我前面提到的,Haskell編譯器不知道該如何展示這個div;看一下輸出的內部結構就會發現,數據確實取到了。你需要在`extractData`(我示例中的函數)裏補充進一步的提取代碼,才能得到網站URL、名稱等信息。這個頂層div裏包含很多內容。你期望在輸出中看到什麼? – Sigrlami