想寫幾句話,幫助那些苦於無法通過 WatiN 取得網頁原始 HTML 源代碼、又不想給 WatiN 本身打補丁的人——當然,要不要這樣做見仁見智。
因此我借鑑 Johan Levin 的補丁,拼湊出了以下代碼。祝各位平安,希望它對你有用。
/// <summary>
/// Returns the source of the page currently loaded in <paramref name="browser"/>,
/// obtained by asking the underlying HTML document to save itself into an
/// in-memory stream rather than by reading a DOM property.
/// </summary>
/// <param name="browser">The browser instance whose document should be read.</param>
/// <returns>
/// A <c>TextVariant</c> built from the saved bytes, decoded with the encoding
/// named by the document's <c>charset</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="browser"/> is null.</exception>
private static TextVariant GetWebPageSource(IE browser)
{
    if (browser == null)
    {
        throw new ArgumentNullException("browser");
    }

    IHTMLDocument2 htmlDocument = ((IEDocument)(browser.DomContainer.NativeDocument)).HtmlDocument;

    // Decode with the charset the document itself reports.
    // NOTE(review): Encoding.GetEncoding throws if the reported charset name is
    // not a known encoding - confirm whether a fallback is wanted here.
    Encoding encoding = Encoding.GetEncoding(htmlDocument.charset);

    // The document object is queried for IPersistStreamInit so it can be saved to a stream.
    IPersistStreamInit persistStream = (IPersistStreamInit)htmlDocument;

    // MinimalIStream derives from MemoryStream, so dispose it deterministically.
    using (MinimalIStream stream = new MinimalIStream())
    {
        // fClearDirty = false: do not ask the document to reset its dirty flag.
        persistStream.Save(stream, false);
        return new TextVariant(encoding.GetString(stream.ToArray()));
    }
}
/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface
/// (IID 7FD52380-4E07-101B-AE2D-08002B2EC713). Members are declared in
/// v-table order and must not be reordered.
/// </summary>
[Guid("7FD52380-4E07-101B-AE2D-08002B2EC713")]
[InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier (CLSID) of the object.</summary>
    void GetClassID(out Guid pClassID);

    /// <summary>
    /// Reports whether the object changed since it was last saved.
    /// Returns the raw HRESULT: S_OK (0) when dirty, S_FALSE (1) when not.
    /// </summary>
    // PreserveSig is required: without it the runtime treats the int as an
    // [out, retval] parameter and S_FALSE can never be observed by the caller.
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the given stream.</summary>
    void Load(IStream pStm);

    /// <summary>Saves the object to the given stream.</summary>
    /// <param name="pStm">Stream that receives the object's data.</param>
    /// <param name="fClearDirty">Whether the dirty flag should be cleared after the save.</param>
    // The native parameter is a 4-byte BOOL; the COM-interop default for bool
    // is the 2-byte VARIANT_BOOL, so the marshaling is stated explicitly.
    void Save(IStream pStm, [MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

    /// <summary>Retrieves the maximum number of bytes needed to save the object.</summary>
    void GetSizeMax(out long pcbSize);

    /// <summary>Initializes the object to a default state.</summary>
    void InitNew();
}
// http://stackoverflow.com/questions/6601355/passing-an-fstream-or-equivalent-from-c-to-c-through-cli
/// <summary>
/// A <see cref="MemoryStream"/> that can be handed to COM as an <see cref="IStream"/>.
/// Only Read, Write, Seek and Stat do real work; the remaining IStream members are
/// intentional no-ops, so this type is suitable only for callers that do not rely on them.
/// </summary>
[ClassInterface(ClassInterfaceType.AutoDispatch)]
public class MinimalIStream : MemoryStream, IStream
{
    /// <summary>Creates an empty, expandable stream.</summary>
    public MinimalIStream() { }

    /// <summary>Creates a stream over the supplied buffer.</summary>
    public MinimalIStream(byte[] data) : base(data) { }

    #region IStream Members

    /// <summary>Appends <paramref name="cb"/> bytes from <paramref name="pv"/> at the current position.</summary>
    /// <param name="pv">Buffer holding the bytes to write.</param>
    /// <param name="cb">Number of bytes to write.</param>
    /// <param name="pcbWritten">Optional pointer that receives the number of bytes written.</param>
    public void Write(byte[] pv, int cb, IntPtr pcbWritten)
    {
        base.Write(pv, 0, cb);
        if (pcbWritten != IntPtr.Zero)
        {
            // pcbWritten is a ULONG* (32-bit); writing 64 bits here would
            // overwrite four bytes beyond the caller's variable.
            Marshal.WriteInt32(pcbWritten, cb);
        }
    }

    /// <summary>Fills <paramref name="pstatstg"/> with the stream's type and current length.</summary>
    public void Stat(out STATSTG pstatstg, int grfStatFlag)
    {
        pstatstg = new STATSTG();
        pstatstg.type = 2; // STGTY_STREAM
        pstatstg.cbSize = base.Length;
    }

    /// <summary>Reads up to <paramref name="cb"/> bytes into <paramref name="pv"/> from the current position.</summary>
    /// <param name="pv">Buffer that receives the bytes.</param>
    /// <param name="cb">Maximum number of bytes to read.</param>
    /// <param name="pcbRead">Optional pointer that receives the number of bytes actually read.</param>
    public void Read(byte[] pv, int cb, IntPtr pcbRead)
    {
        int bytesRead = base.Read(pv, 0, cb);
        if (pcbRead != IntPtr.Zero)
        {
            // pcbRead is a ULONG* (32-bit), same as pcbWritten in Write.
            Marshal.WriteInt32(pcbRead, bytesRead);
        }
    }

    /// <summary>Moves the stream position.</summary>
    /// <param name="dlibMove">Offset to apply.</param>
    /// <param name="dwOrigin">Origin of the move; cast directly to <see cref="SeekOrigin"/>.</param>
    /// <param name="plibNewPosition">Optional pointer that receives the new position.</param>
    public void Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
    {
        long pos = base.Seek(dlibMove, (SeekOrigin)dwOrigin);
        if (plibNewPosition != IntPtr.Zero)
        {
            // Unlike the byte counts above, the new position is a 64-bit ULARGE_INTEGER.
            Marshal.WriteInt64(plibNewPosition, pos);
        }
    }

    /// <summary>Not supported; always yields null.</summary>
    public void Clone(out IStream ppstm)
    {
        ppstm = null;
    }

    /// <summary>No-op.</summary>
    public void Commit(int grfCommitFlags)
    {
    }

    /// <summary>No-op: nothing is copied and the count pointers are left untouched.</summary>
    public void CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
    {
    }

    /// <summary>No-op: region locking is not implemented.</summary>
    public void LockRegion(long libOffset, long cb, int dwLockType)
    {
    }

    /// <summary>No-op: the stream length is not changed.</summary>
    public void SetSize(long libNewSize)
    {
    }

    /// <summary>No-op.</summary>
    public void Revert()
    {
    }

    /// <summary>No-op: region locking is not implemented.</summary>
    public void UnlockRegion(long libOffset, long cb, int dwLockType)
    {
    }

    #endregion
}
謝謝,但是如果我沒記錯的話,OuterHtml 返回的是經過 IE 修改之後的 HTML 代碼。我的猜測是,IE 會解析 HTML 並構建 DOM;當您訪問 OuterHtml 時,IE 會把 DOM 重新序列化為文本格式,其結果與原始文件略有不同。我想要的是 HTML 的逐字拷貝,以便對其進行驗證。 – 2011-04-06 13:47:12