只使用了一个 Java 文件,运行 main 方法即可。需要依赖的 jar 包是 com.alibaba.fastjson(版本 1.2.28)和 jsoup(版本 1.10.2)。
如果使用 Maven(pom.xml),则添加以下两个依赖:
?
1
2
3
4
5
6
7
8
9
10
|
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
|
完整的代码如下:
?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
|
package com.tuniu.fcm.facade.ipproxy;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls free proxy-list pages and collects proxies that pass a connectivity check.
 *
 * <p>Requires {@code com.alibaba.fastjson.JSONObject} and jsoup on the classpath.
 *
 * <p>Per-call state (the wanted count and the proxies found so far) is held in
 * {@link ThreadLocal}s, so one instance can be used from several threads at once.
 * The state is reset at the start of {@link #startCrawler(int)} and removed when it returns.
 */
public class ProxyCralwerUnusedVpn {

    /**
     * Matches {@code "a.b.c.d port"} in the page text: an IPv4-looking address, one space,
     * then a port of up to six digits.
     */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    private final ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();
    private final ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();

    public static void main(String[] args) {
        ProxyCralwerUnusedVpn proxyCrawler = new ProxyCralwerUnusedVpn();
        // Number of proxies wanted; chosen by the caller.
        // The larger the number, the slower the call returns.
        proxyCrawler.startCrawler(1);
    }

    /**
     * Entry point for external callers.
     *
     * <p>Visits each configured listing site in turn until {@code wantedNumber} working
     * proxies have been collected or every page has been tried, prints the JSON result to
     * standard output and returns it.
     *
     * @param wantedNumber number of working proxies the caller would like to obtain
     * @return JSON string with {@code success} and a {@code data} map holding
     *         {@code numfound}, {@code pagenum} and {@code proxy}
     */
    public String startCrawler(int wantedNumber) {
        localWantedNumber.set(wantedNumber);
        // Start from an empty list so results of an earlier call on this thread are not returned again.
        localProxyInfos.set(new ArrayList<ProxyInfo>());
        try {
            kuaiDaiLiCom("http://www.xicidaili.com/nn/", 15);
            kuaiDaiLiCom("http://www.xicidaili.com/nt/", 15);
            kuaiDaiLiCom("http://www.xicidaili.com/wt/", 15);
            kuaiDaiLiCom("http://www.kuaidaili.com/free/inha/", 15);
            kuaiDaiLiCom("http://www.kuaidaili.com/free/intr/", 15);
            kuaiDaiLiCom("http://www.kuaidaili.com/free/outtr/", 15);

            // Build the response payload.
            List<ProxyInfo> found = currentProxies();
            ProxyResponse response = new ProxyResponse();
            response.setSuccess("true");
            Map<String, Object> dataInfoMap = new HashMap<String, Object>();
            dataInfoMap.put("numfound", found.size());
            dataInfoMap.put("pagenum", 1);
            dataInfoMap.put("proxy", found);
            response.setData(dataInfoMap);

            String responseString = JSONObject.toJSON(response).toString();
            System.out.println(responseString);
            return responseString;
        } finally {
            // Drop the per-thread state so it cannot leak into later calls or pooled threads.
            localWantedNumber.remove();
            localProxyInfos.remove();
        }
    }

    /**
     * Fetches pages {@code baseUrl + 1 + "/"} through {@code baseUrl + totalPage + "/"},
     * extracts every "ip port" pair from the page text and keeps those that pass
     * {@link #checkProxy(String, Integer)}. Stops as soon as enough proxies were collected.
     * A page that fails to load is reported and skipped.
     *
     * @param baseUrl   listing URL without the trailing page number
     * @param totalPage number of pages to visit, counted from 1 (inclusive)
     */
    private void kuaiDaiLiCom(String baseUrl, int totalPage) {
        for (int i = 1; i <= totalPage; i++) {
            if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                return;
            }
            try {
                // No explicit "host" header: this method is also used for sites other than
                // kuaidaili.com, and the HTTP client derives Host from the request URL.
                Document doc = Jsoup.connect(baseUrl + i + "/")
                        .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("accept-encoding", "gzip, deflate, sdch")
                        .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6")
                        .header("cache-control", "max-age=0")
                        .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36")
                        .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244")
                        .header("referer", "http://www.kuaidaili.com/free/outha/")
                        .timeout(30 * 1000)
                        .get();
                Matcher m = IP_PORT_PATTERN.matcher(doc.text());
                while (m.find()) {
                    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                        break;
                    }
                    String[] strs = m.group().split(" ");
                    if (checkProxy(strs[0], Integer.parseInt(strs[1]))) {
                        System.out.println("获取到可用代理ip\t" + strs[0] + "\t" + strs[1]);
                        addProxy(strs[0], strs[1], "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: a failing page must not abort the whole crawl.
                e.printStackTrace();
            }
        }
    }

    /**
     * Tests a proxy by fetching a page through it with a two-second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request succeeded, {@code false} on any failure
     */
    private static boolean checkProxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be replaced by any page that responds quickly.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /** Returns how many proxies the current thread has collected so far. */
    private int getCurrentProxyNumber() {
        return currentProxies().size();
    }

    /** Records a working proxy for the current thread. */
    private void addProxy(String ip, String port, String protocol) {
        currentProxies().add(new ProxyInfo(ip, port, protocol));
    }

    /**
     * Returns the current thread's proxy list, creating and registering it on first use
     * so that additions are never made to a list nobody else can see.
     */
    private List<ProxyInfo> currentProxies() {
        List<ProxyInfo> proxyInfos = localProxyInfos.get();
        if (proxyInfos == null) {
            proxyInfos = new ArrayList<ProxyInfo>();
            localProxyInfos.set(proxyInfos);
        }
        return proxyInfos;
    }
}

/**
 * One proxy entry of the response. Accessors use JavaBean naming
 * (presumably what the JSON serializer relies on to discover properties).
 */
class ProxyInfo {
    private String userName = "";
    private String ip;
    private String password = "";
    private String type;
    private String port;
    private int is_internet = 1;

    /**
     * @param ip   proxy host
     * @param port proxy port, kept as text
     * @param type protocol of the proxy, e.g. {@code "http"}
     */
    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPort() {
        return port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public int getIs_internet() {
        return is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}

/** Response envelope: a success flag plus a free-form data map. */
class ProxyResponse {
    private String success;
    private Map<String, Object> data;

    public String getSuccess() {
        return success;
    }

    public void setSuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getData() {
        return data;
    }

    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}
|
以上这篇java代理实现爬取代理ip的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持快网idc。
原文链接:https://blog.csdn.net/sdfiiiiii/article/details/70432060
相关文章
猜你喜欢
- ASP.NET自助建站系统中如何实现多语言支持? 2025-06-10
- 64M VPS建站:如何选择最适合的网站建设平台? 2025-06-10
- ASP.NET本地开发时常见的配置错误及解决方法? 2025-06-10
- ASP.NET自助建站系统的数据库备份与恢复操作指南 2025-06-10
- 个人网站服务器域名解析设置指南:从购买到绑定全流程 2025-06-10
TA的动态
- 2025-07-10 怎样使用阿里云的安全工具进行服务器漏洞扫描和修复?
- 2025-07-10 怎样使用命令行工具优化Linux云服务器的Ping性能?
- 2025-07-10 怎样使用Xshell连接华为云服务器,实现高效远程管理?
- 2025-07-10 怎样利用云服务器D盘搭建稳定、高效的网站托管环境?
- 2025-07-10 怎样使用阿里云的安全组功能来增强服务器防火墙的安全性?
快网idc优惠网
QQ交流群
您的支持,是我们最大的动力!
热门文章
-
2025-05-29 65
-
在中国市场上,WooCommerce和Ueeshop哪个电商平台更适合卖家搭建自己的在线商店?
2025-05-25 16 -
Linux服务器如何进行性能监控和调优?有哪些常见的性能优化技巧?
2025-05-25 64 -
2025-06-05 98
-
2025-05-25 87
热门评论