如何获取所有html链接并存储在变量Jsoup中

时间:2018-02-01 12:23:23

标签: java html jsoup

如何在 crawler() 中获取 listModel 所选条目对应的 [href] 链接,并在 openie() 里把该链接传给 Jsoup.connect() 使用?

这是我的代码:

private void btnExtractActionPerformed(java.awt.event.ActionEvent evt) {                                           
    // Run the OpenIE extraction over the selected headlines; a failed
    // fetch/parse is logged rather than propagated to the EDT.
    try {
        openie();
    } catch (IOException ioe) {
        Logger.getLogger(MainUI.class.getName()).log(Level.SEVERE, null, ioe);
    }
} 

private void btnSearchActionPerformed(java.awt.event.ActionEvent evt) {                                          
    // Crawl Google News for the query typed in txtSearch; network errors
    // are logged rather than propagated to the EDT.
    try {
        crawler();
    } catch (IOException ioe) {
        Logger.getLogger(MainUI.class.getName()).log(Level.SEVERE, null, ioe);
    }
} 

// Last-seen triple parts / crawled URL (kept for compatibility with any
// other code in this class that may read them).
private String subject, object, link;

// Maps each headline shown in newsList to the absolute URL it was scraped
// from. Using a map instead of the single 'link' field fixes the bug where
// only the LAST crawled [href] survived the crawl loop, so openie() always
// fetched the wrong article. NOTE: if two results share the same headline
// text, the later URL overwrites the earlier one.
private final java.util.Map<String, String> urlByTitle = new java.util.HashMap<>();

/**
 * Scrapes the first three Google News result pages for the query typed in
 * txtSearch, fills newsList with the result headlines, and remembers each
 * headline's absolute URL in urlByTitle for later use by openie().
 *
 * @throws IOException if a result page cannot be fetched
 */
private void crawler() throws IOException {
    // Google expects '+' as the query-string space separator.
    String input = txtSearch.getText().replace(" ", "+");
    DefaultListModel<String> listModel = new DefaultListModel<>();
    urlByTitle.clear(); // drop links from any previous search
    // start = 0, 10, 20 -> the first three result pages (10 hits per page).
    for (int start = 0; start <= 20; start += 10) {
        String url = "https://www.google.com/search?q=" + input + "&tbm=nws&source=lnm&start=" + start;
        Document doc = Jsoup.connect(url).userAgent("Mozilla").timeout(10000).get();
        for (Element anchor : doc.select("div.g a[href]")) {
            String title = anchor.text();
            listModel.addElement(title);
            link = anchor.absUrl("href");  // keep the legacy field in sync
            urlByTitle.put(title, link);   // remember EVERY href, not just the last
        }
    }
    newsList.setModel(listModel);
}

/**
 * Runs Stanford OpenIE over the article behind each headline the user has
 * selected in newsList and shows the extracted
 * (confidence, subject, relation, object) triples in tuplesList.
 *
 * @throws IOException if a selected article cannot be fetched
 */
private void openie() throws IOException {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog,openie");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    DefaultListModel<String> extractedList = new DefaultListModel<>();
    for (String selected : newsList.getSelectedValuesList()) {
        extractedList.addElement(selected);
        // Fetch the article belonging to THIS selection. (The old code always
        // used the single 'link' field, i.e. the last URL seen while crawling,
        // and also clobbered 'selected' with the page text.)
        String articleUrl = urlByTitle.get(selected);
        if (articleUrl == null) {
            continue; // headline not recorded by crawler(); nothing to fetch
        }
        Document d = Jsoup.connect(articleUrl).userAgent("Mozilla").timeout(10000).get();
        Annotation doc = new Annotation(d.body().text());
        pipeline.annotate(doc);
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            // Get the OpenIE triples for the sentence.
            Collection<RelationTriple> triples =
                    sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
            for (RelationTriple triple : triples) {
                subject = triple.subjectLemmaGloss().replace(" ", "_").toLowerCase();
                object = triple.objectLemmaGloss().replace(" ", "_").toLowerCase();
                extractedList.addElement(triple.confidence + "\t"
                        + triple.subjectLemmaGloss() + "\t"
                        + triple.relationLemmaGloss() + "\t"
                        + triple.objectLemmaGloss());
            }
        }
    }
    tuplesList.setModel(extractedList);
}

Program

使用我的代码,我可以解析该 URL 并获得 [href]。但由于变量在循环中不断被覆盖,最终只保留了最后一个元素的 [href]——我无法把遍历到的每个元素的 [href] 都存储下来。

1 个答案:

答案 0 :(得分:0)

你需要List变量来存储url,如下所示。

// Select every anchor inside a result block, then collect each anchor's
// absolute URL into a list — one entry per link, nothing overwritten.
Elements links = doc.select("div.g a[href]");
List<String> urls = links.stream().map(i -> i.absUrl("href")).collect(Collectors.toList());