正在尝试从网页中提取电子邮件;我有60个随机网站网址,并尝试从中提取电子邮件以进行测试,正在使用此 [A-Z0-9 ._%+-] + @ [A-Z0-9 .-] {3,65 }。[AZ] {2,4} 正则表达式来查找页面中的电子邮件,并正在使用JSoup解析网站。
一个工作源中的编辑代码
import java.io.IOException;
import java.net.MalformedURLException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class TestingMail {
// HERE WE CONSTRUCT CLASS
public TestingMail(){}
/****************** SETTING MAIN METHOD TO TEST CLASS *************************/
public static void main(String[] args){
// Setting initiator
String Terms="Trending Bitcoin Investment Chat in NETHERLANDS";
TestingMail extractor=new TestingMail();
extractor.extract(Terms, extractor);
}
/****************** HERE WE CONSTRUCT THE EXTRACT METHOD **********************/
public void extract(String terms, TestingMail extractor){
// HERE WE START CONSTRUCTING THE EXTRACT PROCESSES
int NUM_THREADS=10;
int limit=10;
String[] parseURL={};
String[] crawedURL={};
int istype=0;
int start=0;
// HERE WE START PROCESSING
if(terms!=null && terms.length()>0){
SSLContext sc = null;
// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};
try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};
// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
// HERE LETS CRAW DATA FROM GOOGLE
crawedURL=new String[]{"https://www.globfinances.com", "https://napoleoninvestment.net", "https://www.meetup.com/BitcoinWednesday/?_cookie-check=PXZ_aLyoOMcdpbrs"};
if(crawedURL!=null && crawedURL.length>0){
// Here we loop mails to store send mails
if(crawedURL.length<limit){
limit=crawedURL.length;
istype=1;
}
// Here we set the mails length
parseURL=new String[limit];
// HERE WE START THREAD POOL
ExecutorService es = Executors.newFixedThreadPool(NUM_THREADS);
List<Future<Integer>> futures = new ArrayList<>(NUM_THREADS);
// Submit task to every thread:
for (int i = 0; i < NUM_THREADS; i++) {
// Here we loop to get mails
if(start<crawedURL.length){
for(int k=start, j=0; j<crawedURL.length; k++, j++){
if(k<(limit-1)){
System.out.println(i+"=="+j);
// System.out.println(mails[k]);
parseURL[j]=crawedURL[k];
}
else{
start+=limit+1;
break;
}
}
// Here we thread task
futures.add(i, es.submit((Callable<Integer>) new Extractor(parseURL, extractor)));
}
else{
istype=1;
break;
}
// Checking thread type to prevent multiple run
if(istype==1){
break;
}
} // end of loop
// Shutdown thread pool
es.shutdown();
System.out.println("Thread: "+futures.size());
}
}
}
/******************* HERE WE CONSTRUCT THE EXTRACT METHOD *******************/
private Integer mailExtract(String[] urls) throws MalformedURLException{
// HERE WE START CONSTRUCTING THE MAIL EXTRACTED PROCESS
String pattern = "\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b";
Set<String> emails = new HashSet<>();
String[][] extracted={};
int totalMails=0;
// HERE WE START PROCESSING
if(urls!=null && urls.length>0){
extracted=new String[urls.length][];
// Now lets extract mails
Pattern pat = Pattern.compile(pattern);
// Now lets loop
for(int i=0; i<urls.length; i++){
emails=parse(urls[i], pat);
int key=0;
if(emails.size()>0){
for(String email:emails){
extracted[i][key]=email;
key++;
} // end of loop
}
} // end of loop
// HERE WE CHECK EXTRACTED LENGTH
for(int j=0; j<extracted.length; j++){
totalMails=totalMails+extracted[j].length;
} // end of loop
System.out.println(totalMails);
}
// Here we return
return Integer.valueOf(totalMails);
}
/********* HERE WE START CONSTRUCTING THE PARSE FUNCTIONS **********/
public Set<String> parse(String url, Pattern pat){
// HERE WE CONSTRUCT THE EMAIL PARSER PROCESS
Set<String> emailAddresses = new HashSet<>();
boolean found=false;
String contents="";
// HERE WE START PROCESSING
if(url!=null){
contents=urlContent(url);
if(contents.length()>0 && contents.indexOf("body")>=0){
// Pattern pat = Pattern.compile(pattern);
//Matches contents against the given Email Address Pattern
Matcher match = pat.matcher(contents);
found=match.find();
//If match found, append to emailAddresses
System.out.println("I found this: "+found);
while(found) {
emailAddresses.add(match.group());
} // end of while loop
}
}
// Here we return
return emailAddresses;
}
// HERE WE READ URL CONTENT TO STRING
private String urlContent(String url){
// HERE WE CONSTRUCT THE URL CONTENT RETURNER
String content="";
Document doc=null;
String sUrl="";
// HERE WE START PROCESSING
try {
SSLContext sc = null;
// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};
try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};
// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
// HERE WE START CRAWLING
if(url.startsWith("http")){
Connection con=Jsoup.connect(url).timeout(100000).ignoreHttpErrors(true).followRedirects(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)");
Connection.Response resp = con.execute();
// HERE WE CHECK RESPONSE CODE
if (resp.statusCode() == 200) {
doc = con.get();
// Now lets get the text document
content=doc.html();
} // End of status check
else if(resp.statusCode() == 307){
String sNewUrl = resp.header("Location");
if (sNewUrl != null && sNewUrl.length() > 7)
sUrl = sNewUrl;
resp = Jsoup.connect(sUrl).timeout(100000).ignoreHttpErrors(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)").execute();
doc =resp.parse();
// Now lets get the text document
content=doc.html();
} // End of status 307 check
} // end of start with check
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Here we return
return content;
}
/************* HERE WE CONSTRUCT INNER CLASS TO HANDLE THREAD *****************/
public static final class Extractor implements Callable<Integer>{
// HERE WE CONSTRUCT CLASS
String[] Urls;
TestingMail Extract;
public Extractor(String[] urls, TestingMail extract){
Urls=urls;
Extract=extract;
}
/*********** HERE WE CALL THE CALLABLE ***********/
@Override
public Integer call() throws Exception {
try {
return Extract.mailExtract(Urls);
} catch (Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
// END OF CLASS
}
// END OF CLASS
}
我添加了一些打印语句来监视该过程,并且通过Java模式匹配,我一直得到的都是错误的
这是我控制台中的内容
52
0==0
0==1
0==2
0==3
0==4
0==5
0==6
0==7
0==8
Thread: 5 Extracted Mails: 0
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
我似乎无法理解为什么不收到电子邮件,以上网站中至少有一个在页脚中包含支持电子邮件,但我的代码似乎无法收到。 我什至将表情改为: \ b [a-zA-Z0-9 .-] + @。[a-zA-Z0-9.- ] + \ b ,但结果相同。我不知道丢失了什么,或者为什么表示不起作用。
我们将不胜感激
答案 0 :(得分:2)
对于任何编程问题,最有帮助的功能是Minimal, Complete, and Verifiable example。这是您遇到的问题:
import java.util.regex.*;
class Test {
public static void main(String[] args) {
Pattern pat = Pattern.compile("\\\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z0-9.-]+\\\\b");
Matcher match = pat.matcher("<li>email@example.com</li>");
System.out.println("I found this: "+ match.find() + " with expression: " + pat);
}
}
它更短,但结果与代码相同:
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
问题在于反斜杠是两次转义的。这是没有多余转义的版本:
import java.util.regex.*;
class Test {
public static void main(String[] args) {
Pattern pat = Pattern.compile("\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b");
Matcher match = pat.matcher("<li>info@napoleoninvestment.net</li>");
System.out.println("I found this: "+ match.find() + " with expression: " + pat);
}
}
这是输出,现在显示一个匹配项:
I found this: true with expression: \b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+\b
不幸的是,我不知道如何将其应用于您的代码,因为您没有包括定义pattern
的部分。这很可能是由于对代码的哪一层需要进行什么转义感到困惑。例如,将Java字符串文字复制粘贴到文件中时,回读时将 not 导致相同的字符串文字,因为一个是Java语法,一个是原始数据,而后者则不需要或不允许转义。
答案 1 :(得分:1)
您的正则表达式“很好”,通常您的代码对我有用。它会下载页面并进行一些更正后查找电子邮件。您下载的页面内容中一定有东西。尝试显示下载页面的HTML,以确保获得正确的响应。
无论如何,几乎没有错误:
()
”,但是您正在呼叫matcher.group(1)
。这将导致异常。请改用matcher.group()
。答案 2 :(得分:0)
答案 3 :(得分:0)
您的正则表达式不允许使用小写字母。那可以解释为什么它与大多数电子邮件地址不匹配。
要解决此问题,可以使用CASE_INSENSITIVE
标志对其进行编译。 (请参见https://stackoverflow.com/a/22991877/423105)