我目前正在創建一個 Node.js 網頁抓取器/代理,但我無法解析源代碼腳本部分中出現的相對 URL,我想用正則表達式(REGEX)應該可以解決,只是不知道該如何實現。JavaScript:用 REGEX 將所有相對網址改爲絕對網址
有沒有辦法可以做到這一點?
另外,我這樣做是不是最簡單的方式?因爲我很困惑其他代理是如何解析網站的。我認爲大多數代理只是被美化了的網站抓取器:讀取網站的源代碼,再把所有鏈接/表單轉發回代理。
我目前正在創建一個 Node.js 網頁抓取器/代理,但我無法解析源代碼腳本部分中出現的相對 URL,我想用正則表達式(REGEX)應該可以解決,只是不知道該如何實現。JavaScript:用 REGEX 將所有相對網址改爲絕對網址
有沒有辦法可以做到這一點?
另外,我這樣做是不是最簡單的方式?因爲我很困惑其他代理是如何解析網站的。我認爲大多數代理只是被美化了的網站抓取器:讀取網站的源代碼,再把所有鏈接/表單轉發回代理。
給提問者的說明(因爲他要求這樣的功能):將 base_url
更改爲您代理的基本URL,以達到預期的效果。
下面會顯示兩個功能(使用指南包含在代碼中)。確保你不要跳過這個答案的任何部分,以充分理解函數的行爲。
rel_to_abs(urL)
- 此函數返回絕對URL。當傳遞一個具有通用信任協議的絕對URL時,它將立即返回該URL。否則,將從base_url
和函數參數生成一個絕對URL。相對URL被正確解析(../
; ./
; .
; //
)。replace_all_rel_by_abs
- 此功能將解析所有在HTML中具有重要含義的URL,例如CSS url()
,鏈接和外部資源。查看代碼以獲得解析實例的完整列表。請參閱this answer調整實施至從外部源(嵌入文檔)清理HTML字符串。rel_to_abs
- 解析相對URL
/**
 * Resolves a URL reference against the current document (global `location`).
 *
 * - URLs that start with a trusted scheme are returned untouched.
 * - Protocol-relative ("//host/x") and root-relative ("/x") references get the
 *   current protocol / protocol+host prepended.
 * - Blank input yields "".
 * - Everything else is resolved against the directory of the current page,
 *   with "." and ".." segments removed. Query-only ("?a=b") and fragment-only
 *   ("#top") references keep the current file name.
 *
 * The result of the last case has the characters " ' < > percent-encoded so
 * it can be written back into an HTML attribute.
 *
 * @param {String} url URL reference taken from a document
 * @return {String} absolute URL, or "" for blank input
 */
function rel_to_abs(url){
    /* Only accept commonly trusted protocols:
     * Only data-image URLs are accepted, Exotic flavours (escaped slash,
     * html-entitied characters) are not supported to keep the function fast */
    if(/^(https?|file|ftps?|mailto|javascript|data:image\/[^;]{2,9};):/i.test(url))
        return url; //Url is already absolute

    if(url.substring(0,2) == "//")
        return location.protocol + url;
    if(url.charAt(0) == "/")
        return location.protocol + "//" + location.host + url;
    if(/^\s*$/.test(url))
        return ""; //Empty = Return nothing

    /* Replace this value to resolve against a different document. */
    var base_url = location.href;

    /* Take the base apart: the fragment is dropped, the query is set aside so
     * that slashes inside it can never be mistaken for path separators. */
    var here = base_url.replace(/#[\s\S]*$/, "");
    var query_at = here.indexOf("?");
    var base_query = "";
    if(query_at != -1){
        base_query = here.substring(query_at);
        here = here.substring(0, query_at);
    }
    var parts = /^([a-z][a-z0-9+.\-]*:\/\/[^\/]*)(\/[\s\S]*)?$/i.exec(here);
    var origin = parts ? parts[1] : location.protocol + "//" + location.host;
    var base_path = parts && parts[2] ? parts[2] : "/";

    /* Separate the path of the reference from its own query/fragment. */
    var cut = url.search(/[?#]/);
    var rel_path = cut == -1 ? url : url.substring(0, cut);
    var suffix = cut == -1 ? "" : url.substring(cut);

    var path;
    if(rel_path == ""){
        /* "?x" or "#x": same file; a bare fragment also keeps the query. */
        path = base_path;
        if(suffix.charAt(0) == "#") suffix = base_query + suffix;
    } else {
        /* Directory of the current file + relative path, then drop dot segments.
         * segments[0] is always "" because base_path starts with "/". */
        var directory = base_path.substring(0, base_path.lastIndexOf("/") + 1);
        var segments = (directory + rel_path).split("/");
        var resolved = [];
        for(var i=0; i<segments.length; i++){
            var segment = segments[i];
            var is_last = i == segments.length - 1;
            if(segment == ".."){
                if(resolved.length > 1) resolved.pop(); //Never climb above the root
                if(is_last) resolved.push("");          //"a/.." ends with a slash
            } else if(segment == "."){
                if(is_last) resolved.push("");
            } else {
                resolved.push(segment);
            }
        }
        path = resolved.join("/");
    }

    /* Escape certain characters to prevent XSS */
    return (origin + path + suffix).replace(/"/g,"%22").replace(/'/g,"%27")
                                   .replace(/</g,"%3C").replace(/>/g,"%3E");
}
例/例子:
http://foo.bar
。已經是絕對URL,因此立即返回。/doo
相對於根目錄:返回當前根目錄+提供的相對URL。./meh
相對於當前目錄。../booh
相對於父目錄。功能相對路徑轉換爲../
,並執行搜索和替換(http://domain/sub/anything-but-a-slash/../me
到http://domain/sub/me
)。
replace_all_rel_by_abs
- 轉換網址,所有相關的出現次數的腳本實例(<script>
內
這個腳本里面有一些註釋。正則表達式是動態創建的,因爲單個正則表達式的長度可以達到數千個字符。 <meta http-equiv=refresh content=.. >
可以以各種方式混淆,因此RE的大小。
/**
 * Rewrites relative URLs found in an HTML string to absolute URLs by passing
 * each one through rel_to_abs().
 *
 * Handled locations, in this order: <meta http-equiv=refresh content=...>,
 * href=, src=, <object data=>, <applet codebase=>, <param name=movie value=>,
 * url(...) inside <style> elements and url(...) inside style attributes.
 * The work is done with dynamically built regular expressions; no HTML
 * parser is involved.
 *
 * @param {String} html HTML source to process
 * @return {String} the HTML with the matched URLs replaced
 */
function replace_all_rel_by_abs(html){
    /*HTML/XML Attribute may not be prefixed by these characters (common
       attribute chars.  This list is not complete, but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
    var att = "[^-a-z0-9:._]";

    /* Prefixes of numeric character references: ampersand, hash, optional
     * leading zeros (decimal) and ampersand, hash, x, optional zeros (hex).
     * They are assembled from pieces on purpose: written as one literal they
     * get decoded by HTML renderers when this source is copied around, which
     * silently turns the patterns into ones that match bare numbers. */
    var amp = "&";
    var decRef = amp + "#0*";
    var hexRef = amp + "#x0*";
    /* A reference ends with ";" or, without it, must not run into a digit */
    var entityEnd = "(?:;|(?!\\d))";
    var ents = {" ":"(?:\\s|"+amp+"nbsp;?|"+decRef+"32"+entityEnd+"|"+hexRef+"20"+entityEnd+")",
                "(":"(?:\\(|"+decRef+"40"+entityEnd+"|"+hexRef+"28"+entityEnd+")",
                ")":"(?:\\)|"+decRef+"41"+entityEnd+"|"+hexRef+"29"+entityEnd+")",
                ".":"(?:\\.|"+decRef+"46"+entityEnd+"|"+hexRef+"2e"+entityEnd+")"};
                /* Placeholders to filter obfuscations */
    var charMap = {};   //Cache: lower-case character -> RE pattern
    var s = ents[" "]+"*"; //Short-hand for common use
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /*^Important: Must be pre- and postfixed by <and>.
     * This RE should match anything within a tag! */

    /*
      @name ae
      @description Builds a RE pattern that matches the given string when each
                   character is written literally or as a decimal/hexadecimal
                   character reference, in lower or upper case.
                   Results are cached in ents (whole string) and charMap (char).
      @param String string String to convert
      @return String RE pattern
    */
    function ae(string){
        if(ents[string]) return ents[string];
        var all_chars_lowercase = string.toLowerCase();
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push(decRef + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push(hexRef + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                /* The literal upper-case letter is covered by the ignorecase
                 * flag of the callers; only its references need adding */
                RE_sub.push(decRef + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push(hexRef + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
      @name by
      @description  2nd argument for replace(). Keeps the text around the URL
                    (group1, group3) and makes the URL (group2) absolute.
    */
    function by(match, group1, group2, group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name by2
      @description  2nd argument for replace(). Like by(), but first decodes
                    character references for "/" and "." inside the URL.
    */
    var slashRE = new RegExp(ae("/"), 'gi');
    var dotRE = new RegExp(ae("."), 'gi');
    function by2(match, group1, group2, group3){
        /*Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                              search-and-replace on attributes
      @param String selector  HTML substring to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
      @param String delimiter Optional; extra characters (character-class
                              syntax) that end a value
      @param String end       Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
    */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        marker = typeof marker == "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        end = typeof end == "string" ? "?)("+end : ")(";
        /* re1: "double quoted", re2: 'single quoted', re3: unquoted value */
        var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')', 'gi');
        var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")", 'gi');
        var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')', 'gi');
        html = html.replace(selector, function(match){
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }
    /*
      @name cri
      @description            Selects an attribute of a HTML element, and
                              performs a search-and-replace on certain values
      @param String selector  HTML element to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi"
      @param String delimiter Optional RegExp fragment; an unquoted value stops
                              before the first position where it matches
      @param String end       Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
    */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        flags = typeof flags == "string" ? flags : "gi";
        var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)', 'gi');
        var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)", 'gi');
        /* NOTE(review): at1/at2 need a quote character after the URL, but the
         * value captured by re1/re2 stops before the closing quote, so a plain
         * content="0;url=x" is left untouched - confirm the intent. */
        var at1 = new RegExp('('+front+')([^"]+)(")', flags);
        var at2 = new RegExp("("+front+")([^']+)(')", flags);
        var handleAttr;
        if(typeof delimiter == "string"){
            end = typeof end == "string" ? end : "";
            /* The delimiter is a RE fragment, so it is applied as a negative
             * look-ahead. (Pasting it into a character class would ban every
             * character that occurs in its source text, e.g. all digits.) */
            var stop = delimiter ? "(?!"+delimiter+")" : "";
            var value = "[^\"'](?:"+stop+"[\\s\\S])*";
            var at3 = new RegExp("("+front+")("+value + (end?"?)("+end+")":")()"), flags);
            handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2).replace(at3, by2)};
        } else {
            handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2)};
        }
        html = html.replace(selector, function(match){
             return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i");

    cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value");

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */
    cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")")); /*< style=" url(...) " > */
    return html;
}
私有函數的簡短摘要:
rel_to_abs(url)
- 相對/未知的URL轉換爲絕對URLreplace_all_rel_by_abs(html)
- 替換由絕對URL的HTML的字符串中的URL的所有相關OCCURENCES 。
ae
- Any Entity(任意實體)- 返回用於處理HTML實體的正則表達式模式。by
- 用替換- 這個簡短的函數請求實際的URL替換(rel_to_abs
)。如果不是千次,這個函數可能被稱爲數百個。請小心不要將慢速算法添加到此功能(自定義)。cr
- Create Replace(創建替換)- 創建並執行搜索和替換。例如: href="..."
(在任何HTML標籤內)。cri
- Create Replace Inline(創建內聯替換)- 創建並執行搜索和替換。例如: url(..)
屬於HTML標記內的所有style
屬性。打開任何頁面,並粘貼在地址欄中以下書籤:
javascript:void(function(){var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(s)})();
注入的代碼包含了上面定義的兩個函數,以及下面所示的測試用例。 注意:測試用例不會修改頁面的HTML,而是(可選地)在一個textarea中顯示解析的結果。
// Test harness: times one run of replace_all_rel_by_abs() over the current
// page and, on confirmation, shows the rewritten HTML in an overlay textarea.
// The page itself is not modified.
var t = new Date().getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
if (confirm(new Date().getTime() - t + " milliseconds to execute\n\nPut results in new textarea?")) {
    var txt = document.createElement("textarea");
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    // Double-click removes the overlay again.
    txt.ondblclick = function () {
        this.parentNode.removeChild(this);
    };
    txt.value = result;
    document.body.appendChild(txt);
}
參見:
如果您使用正則表達式來查找所有非絕對URL,則只需在它們前面加上當前URL即可。
你需要修復將是那些不與任何一個/
或http(s)://
開始(或其他協議標記,如果你關心他們)
舉個例子,假設您正在刮網址http://www.example.com/
。如果你遇到一個相對URL,讓我們說foo/bar
,只需將前綴的網址被刮它,像這樣:http://www.example.com/foo/bar
至於用來從頁面中抓取網址的正則表達式,只要搜索一下就能找到很多現成的好方案,所以我就不在這裏重新發明輪子了 :)
到URL從相對轉換爲絕對可靠的方法是使用內置的url
module。
例子:
// Load the "url" module with require() (Node.js; per the discussion below
// this approach is not usable for code that runs in a browser).
var url = require('url');
// Resolve the relative reference (2nd argument) against the base URL (1st argument).
url.resolve("http://www.example.org/foo/bar/", "../baz/qux.html");
>> gives 'http://www.example.org/foo/baz/qux.html'
「require」來自哪裏? –
@ajkochanowicz:問題是關於一個Node.js應用程序。 'require()'是C'#include <...>'的Node.js等價物。 (嗯,不完全是。)所以,在編寫JS代碼在瀏覽器中運行時,我的答案是無法使用的。 – tuomassalo
這是本帖中 Rob W 的回答 "Advanced HTML string replacement functions",加上我做的一些代碼重構,以便通過 JSLint 的檢查。
我應該發佈它作爲答案的評論,但我沒有足夠的聲望點。
/*jslint browser: true */
/*jslint regexp: true */
/*jslint unparam: true*/
/*jshint strict: false */
/**
* convertRelToAbsUrl
*
* https://stackoverflow.com/a/7544757/1983903
*
* @param {String} url
* @return {String} updated url
*/
/**
 * Resolves a URL reference against the current document (global `location`).
 *
 * URLs with a trusted scheme are returned untouched; "//host/x" and "/x" get
 * the current protocol / protocol + host prepended; blank input yields ''.
 * Anything else is resolved against the directory of the current page with
 * "." and ".." segments removed, and the characters " ' < > percent-encoded.
 * Query-only and fragment-only references keep the current file name.
 */
function convertRelToAbsUrl(url) {
    var baseUrl = null,
        here = null,
        queryAt = -1,
        baseQuery = '',
        parts = null,
        origin = null,
        basePath = null,
        cut = -1,
        relPath = null,
        suffix = null,
        path = null,
        segments = null,
        resolved = [],
        segment = null,
        isLast = false,
        i = 0;
    if (/^(https?|file|ftps?|mailto|javascript|data:image\/[^;]{2,9};):/i.test(url)) {
        return url; // url is already absolute
    }
    if (url.substring(0, 2) === '//') {
        return location.protocol + url;
    }
    if (url.charAt(0) === '/') {
        return location.protocol + '//' + location.host + url;
    }
    if (/^\s*$/.test(url)) {
        return ''; // empty = return nothing
    }
    // replace this value to resolve against a different document
    baseUrl = location.href;
    // drop the fragment and set the query aside, so that slashes inside the
    // query are never treated as path separators
    here = baseUrl.replace(/#[\s\S]*$/, '');
    queryAt = here.indexOf('?');
    if (queryAt !== -1) {
        baseQuery = here.substring(queryAt);
        here = here.substring(0, queryAt);
    }
    parts = /^([a-z][a-z0-9+.\-]*:\/\/[^\/]*)(\/[\s\S]*)?$/i.exec(here);
    origin = parts ? parts[1] : location.protocol + '//' + location.host;
    basePath = parts && parts[2] ? parts[2] : '/';
    // separate the path of the reference from its own query/fragment
    cut = url.search(/[?#]/);
    relPath = cut === -1 ? url : url.substring(0, cut);
    suffix = cut === -1 ? '' : url.substring(cut);
    if (relPath === '') {
        // "?x" or "#x": same file; a bare fragment also keeps the query
        path = basePath;
        if (suffix.charAt(0) === '#') {
            suffix = baseQuery + suffix;
        }
    } else {
        // directory of the current file + relative path; segments[0] is
        // always '' because basePath starts with '/'
        segments = (basePath.substring(0, basePath.lastIndexOf('/') + 1) + relPath).split('/');
        for (i = 0; i < segments.length; i++) {
            segment = segments[i];
            isLast = i === segments.length - 1;
            if (segment === '..') {
                if (resolved.length > 1) {
                    resolved.pop(); // never climb above the root
                }
                if (isLast) {
                    resolved.push(''); // "a/.." ends with a slash
                }
            } else if (segment === '.') {
                if (isLast) {
                    resolved.push('');
                }
            } else {
                resolved.push(segment);
            }
        }
        path = resolved.join('/');
    }
    // escape characters that could break out of an HTML attribute
    return (origin + path + suffix).replace(/"/g, '%22').replace(/'/g, '%27')
        .replace(/</g, '%3C').replace(/>/g, '%3E');
}
/**
* convertAllRelativeToAbsoluteUrls
*
* https://stackoverflow.com/a/7544757/1983903
*
* @param {String} html
* @return {String} updated html
*/
/**
 * Rewrites relative URLs found in an HTML string to absolute URLs by passing
 * each one through convertRelToAbsUrl.
 *
 * Handled locations, in this order: <meta http-equiv=refresh content=...>,
 * href=, src=, <object data=>, <applet codebase=>, <param name=movie value=>,
 * url(...) inside <style> elements and url(...) inside style attributes.
 * Matching is done with dynamically built regular expressions.
 *
 * When called as a method of an object that has its own convertRelToAbsUrl,
 * that method is used; otherwise the convertRelToAbsUrl in scope is called.
 */
function convertAllRelativeToAbsoluteUrls(html) {
    var me = this,
        // characters that may not directly precede an attribute name
        att = '[^-a-z0-9:._]',
        // Prefixes of numeric character references (decimal / hexadecimal).
        // Assembled from pieces on purpose: written as one literal they get
        // decoded by HTML renderers when this source is copied around, which
        // turns the patterns into ones that match bare numbers.
        amp = '&',
        decRef = amp + '#0*',
        hexRef = amp + '#x0*',
        // a reference ends with ';' or must not run into another digit
        entityEnd = '(?:;|(?!\\d))',
        ents = {
            ' ' : '(?:\\s|' + amp + 'nbsp;?|' + decRef + '32' + entityEnd + '|' + hexRef + '20' + entityEnd + ')',
            '(' : '(?:\\(|' + decRef + '40' + entityEnd + '|' + hexRef + '28' + entityEnd + ')',
            ')' : '(?:\\)|' + decRef + '41' + entityEnd + '|' + hexRef + '29' + entityEnd + ')',
            '.' : '(?:\\.|' + decRef + '46' + entityEnd + '|' + hexRef + '2e' + entityEnd + ')'
        },
        charMap = {}, // cache: lower-case character -> pattern
        s = ents[' '] + '*', // short-hand for common use
        // matches anything inside a tag; must be wrapped in < and >
        any = '(?:[^>\"\']*(?:\"[^\"]*\"|\'[^\']*\'))*?[^>]*',
        slashRE = null,
        dotRE = null;

    // Makes one URL absolute without relying on `this` being defined.
    function toAbs(url) {
        if (me && typeof me.convertRelToAbsUrl === 'function') {
            return me.convertRelToAbsUrl(url);
        }
        return convertRelToAbsUrl(url);
    }

    // Builds a pattern matching `string` with every character written
    // literally or as a decimal/hex character reference (either case).
    function ae(string) {
        var allCharsLowerCase = string.toLowerCase(),
            allCharsUpperCase = string.toUpperCase(),
            reRes = '',
            charLowerCase = null,
            charUpperCase = null,
            reSub = null,
            i = null;
        if (ents[string]) {
            return ents[string];
        }
        for (i = 0; i < string.length; i++) {
            charLowerCase = allCharsLowerCase.charAt(i);
            if (charMap[charLowerCase]) {
                reRes += charMap[charLowerCase];
                continue;
            }
            charUpperCase = allCharsUpperCase.charAt(i);
            reSub = [charLowerCase];
            reSub.push(decRef + charLowerCase.charCodeAt(0) + entityEnd);
            reSub.push(hexRef + charLowerCase.charCodeAt(0).toString(16) + entityEnd);
            if (charLowerCase !== charUpperCase) {
                // the literal upper-case letter is covered by the callers'
                // ignore-case flag; only its references need adding
                reSub.push(decRef + charUpperCase.charCodeAt(0) + entityEnd);
                reSub.push(hexRef + charUpperCase.charCodeAt(0).toString(16) + entityEnd);
            }
            reSub = '(?:' + reSub.join('|') + ')';
            reRes += (charMap[charLowerCase] = reSub);
        }
        return (ents[string] = reRes);
    }

    // replace() callback: keep the surrounding text, make group2 absolute
    function by(match, group1, group2, group3) {
        return group1 + toAbs(group2) + group3;
    }
    slashRE = new RegExp(ae('/'), 'gi');
    dotRE = new RegExp(ae('.'), 'gi');
    // replace() callback: like by(), but decodes references for "/" and "."
    function by2(match, group1, group2, group3) {
        group2 = group2.replace(slashRE, '/').replace(dotRE, '.');
        return group1 + toAbs(group2) + group3;
    }

    // Selects HTML via `selector` and rewrites the value that follows
    // `attribute` + `marker`. `delimiter` adds characters (character-class
    // syntax) that end a value; `end` forces the value to stop before it.
    function cr(selector, attribute, marker, delimiter, end) {
        var re1 = null,
            re2 = null,
            re3 = null;
        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }
        attribute = att + attribute;
        marker = typeof marker === 'string' ? marker : '\\s*=\\s*';
        delimiter = typeof delimiter === 'string' ? delimiter : '';
        end = typeof end === 'string' ? '?)(' + end : ')(';
        // re1: "double quoted", re2: 'single quoted', re3: unquoted value
        re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        re2 = new RegExp('(' + attribute + marker + '\')([^\'' + delimiter + ']+' + end + ')', 'gi');
        re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');
        html = html.replace(selector, function (match) {
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    // Selects an attribute of HTML matched by `selector` and rewrites the
    // URLs that follow `front` inside the attribute value. `delimiter` is a
    // RegExp fragment: an unquoted value stops before the first position
    // where it matches; `end` must follow the value.
    function cri(selector, attribute, front, flags, delimiter, end) {
        var re1 = null,
            re2 = null,
            at1 = null,
            at2 = null,
            at3 = null,
            stop = null,
            value = null,
            handleAttr = null;
        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }
        attribute = att + attribute;
        flags = typeof flags === 'string' ? flags : 'gi';
        re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        // NOTE(review): at1/at2 need a quote character after the URL, but the
        // value captured by re1/re2 stops before the closing quote, so a plain
        // content="0;url=x" is left untouched - confirm the intent.
        at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        at2 = new RegExp("(" + front + ")([^']+)(')", flags);
        if (typeof delimiter === 'string') {
            end = typeof end === 'string' ? end : '';
            // The delimiter is a RegExp fragment, so it is applied as a
            // negative look-ahead. (Pasting it into a character class would
            // ban every character of its source text, e.g. all digits.)
            stop = delimiter ? '(?!' + delimiter + ')' : '';
            value = '[^\"\'](?:' + stop + '[\\s\\S])*';
            at3 = new RegExp('(' + front + ')(' + value + (end ? '?)(' + end + ')' : ')()'), flags);
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function (match) {
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri('<meta' + any + att + 'http-equiv\\s*=\\s*(?:\"' + ae('refresh')
        + '\"' + any + '>|\'' + ae('refresh') + '\'' + any + '>|' + ae('refresh')
        + '(?:' + ae(' ') + any + '>|>))', 'content', ae('url') + s + ae('=') + s, 'i');
    cr('<' + any + att + 'href\\s*=' + any + '>', 'href'); /* Linked elements */
    cr('<' + any + att + 'src\\s*=' + any + '>', 'src'); /* Embedded elements */
    cr('<object' + any + att + 'data\\s*=' + any + '>', 'data'); /* <object data= > */
    cr('<applet' + any + att + 'codebase\\s*=' + any + '>', 'codebase'); /* <applet codebase= > */
    /* <param name=movie value= >*/
    cr('<param' + any + att + 'name\\s*=\\s*(?:\"' + ae('movie') + '\"' + any + '>|\''
        + ae('movie') + '\'' + any + '>|' + ae('movie') + '(?:' + ae(' ') + any + '>|>))', 'value');
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,
        'url', '\\s*\\(\\s*', '', '\\s*\\)'); /* <style> */
    cri('<' + any + att + 'style\\s*=' + any + '>', 'style',
        ae('url') + s + ae('(') + s, 0, s + ae(')'), ae(')')); /*< style=" url(...) " > */
    return html;
}
根據 Rob W 關於 base 標籤的評論,我寫了這個注入函數:
/**
 * Puts `base` (markup, e.g. a complete <base href="..."> tag) just before
 * </head>, after removing any <base> elements already inside <head>.
 * All other head content is kept. Only real <head> elements are touched,
 * not <thead> or <header>. HTML without a <head>...</head> pair is returned
 * unchanged.
 *
 * @param {String} html HTML source
 * @param {String} base markup to insert, copied literally
 * @return {String} updated HTML
 */
function injectBase(html, base) {
    var headElement = /(<head\b[^>]*>)([\s\S]*?)(<\/head\s*>)/gi;
    // A callback is used so that "$" sequences inside `base` (possible in
    // URLs) are inserted literally instead of being read as replacement patterns.
    return html.replace(headElement, function (match, open, inner, close) {
        // Remove any <base> elements inside <head> (\b keeps <basefont>)
        var cleaned = inner.replace(/<base\b[^>]*>/gi, "");
        // Add <base> just before </head>
        return open + cleaned + " " + base + " " + close;
    });
}
我會使用一個真正的解析器,而不是一個正則表達式。有節點的html解析器。 – thejh