我有一个问题,我知道我想要抓取一个网站的信息,我也知道信息的位置。我知道它在哪个类中以及xpath。
我遇到的问题是无论我尝试什么,似乎都无法抓住内容。
这是我的刮刮功能:
function scrape(doc, url) {
var itemType = detectWeb(doc, doc.location.href);
var keywords = new Array();
var keywordText = doc.evaluate('//div[span="Index Terms:"]/div', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
if (keywordText) keywords = (Zotero.Utilities.trimInternal(keywordText.textContent.toLowerCase())).split(",");
var attachments = new Array();
var notes = new Array();
attachments.push({
document: doc,
mimeType: "text/html",
title: "IEEE Computer Snapshot"
});
var htmls = doc.evaluate('//img[@src="/plugins/images/digitalLibrary/dl_html_icon.gif"]/ancestor::a', doc, null, XPathResult.ANY_TYPE, null);
var htmlDoc;
//TESTING
//var affiliation = doc.getElementsByTagName('meta')[property='citation_author_institution'].content;
//var affiliations = [];
var abstracts;
if (htmlDoc = htmls.iterateNext()) {
//var urlField = htmlDoc.attributes.getNamedItem("onclick").value;
var urlField = htmlDoc.href;
urlField = urlField.substr(urlField.indexOf('"') + 1);
urlField = urlField.substr(0, urlField.indexOf('"'));
if (urlField.indexOf("?") > -1) {
urlField += '&' + templte;
} else {
urlField += '?' + templte;
}
urlField = "http://www2.computer.org" + urlField;
var mimeTypeField = "text/html";
var titleField = "IEEE Computer Full Text Snapshot";
var attachment = {
url: urlField,
mimeType: mimeTypeField,
title: titleField
};
attachments.push(attachment);
}
var pdfurl = ZU.xpathText(doc, '//div[@class="abs-pdf"]/a/@href')
if (pdfurl) {
var mimeTypeField = "application/pdf";
var titleField = "IEEE Computer Full Text PDF";
var attachment = {
url: pdfurl,
mimeType: mimeTypeField,
title: titleField
};
attachments.push(attachment);
} else {
notes.push({
note: "Complete PDF document was either not available or accessible. Please make sure you're logged in to the digital library to retrieve the complete PDF document."
});
}
var bibtex = doc.evaluate('//div[@id="bibText-content"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
var bibtexlink = ZU.xpathText(doc, '//li/a[contains(text(), "BibTex") and contains(@href, ".bib")]/@href')
if (bibtex) {
bibtex = bibtex.textContent;
//bibtex = bibtex.substring(bibtex.indexOf("document.write('")+16,bibtex.indexOf("');Popup.document.close();"));
//workaround as bibtex translator obviously needs a whitespace following the first curly brace
bibtex = Zotero.Utilities.cleanTags(bibtex);
bibtex = Zotero.Utilities.trimInternal(bibtex);
var translator = Zotero.loadTranslator("import");
translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
translator.setString(bibtex);
translator.setHandler("itemDone", function(obj, item) {
if (item.url) { // add http to url
item.url = "http://" + item.url;
}
if (itemType) item.itemType = itemType;
item.attachments = attachments;
if (keywords) item.tags = keywords;
if (notes) item.notes = notes;
if (item.DOI) item.DOI = item.DOI.replace(/^.*?10\./, "10.");
//Affiliations
/*if (affiliation)
{
for (i=0; i<affiliations.length; i++)
{
affiliation.push(affiliations[i].textContent)
}
item.extra = affiliation.join("; ");
}*/
if (abstracts) {
item.abstractNote = abstracts;
}
item.complete();
});
translator.translate();
} else if (bibtexlink) {
ZU.doGet(bibtexlink, function(text) {
var translator = Zotero.loadTranslator("import");
translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
translator.setString(text);
translator.setHandler("itemDone", function(obj, item) {
if (item.url) { // add http to url
item.url = "http://" + item.url;
}
if (itemType) item.itemType = itemType;
item.attachments = attachments;
if (keywords) item.tags = keywords;
if (notes) item.notes = notes;
if (item.DOI) item.DOI = item.DOI.replace(/^.*?10\./, "10.");
//Affiliations
/*if (affiliation)
{
for (i=0; i<affiliations.length; i++)
{
affiliation.push(affiliations[i].textContent)
}
item.extra = affiliation.join("; ");
}*/
//Abstract
if (abstracts) {
item.abstractNote = abstracts;
}
item.complete();
});
translator.translate();
})
} else {
throw "No BibTeX found!";
}
}
这是一个名为摘要的变量,我想填写这个网站的摘要。
我使用Firebug找到存储此信息的位置,并在class="article"
的{{1}}中找到该信息。
它看起来像这样:
div="tabs-main"
在Firebug中我也得到了XPath,它是:
<div id="tabs-main">
<!-- place holder -->
<div class="tab-content" id="articleDetails" role="main" data-section="articleDetails.ajax"
>
<div class="article-blk">
<div class="article">
(I want this)--> <p>Distributed database systems (DDBS) have received considerable attention in recent years. Being a relatively young research field, there are still many problems associated with DDB systems that need solution. Concurrency control is one of these problems and, probably, the most extensively studied. However, most of the work has concentrated on the development of alternative solutions and the field seems to be ready for some comparative analysis work. This paper reports the results of a performance evaluation study on distributed database concurrency control algorithms. The research has resulted in the development of a formalism, based on Petri nets, for modeling and analysis purposes. The formalism, called the Extended Place/Transition Nets (EPTN), is both descriptively powerful in that it can be used to model various algorithms precisely and succinctly and to communicate them in a clear manner, while at the same time lending itself to be used as a performance evaluation tool. An EPTN simulator is implemented and various algorithms are studied using this tool. This paper describes both the formalism and the performance results that have been obtained.</p>
</div>
但我不知道如何获得这些内容。我试过了
/html/body/div[2]/div[8]/div/div[2]/div/div[2]/div[1]/div/div[1]
我尝试过var abstracts = doc.querySelector(".article").innerHTML;
。
但我永远无法得到内容,var总是为空。 有人有想法吗?