java代理实现爬取代理IP的示例

2025-05-29 0 59

本示例只用了一个 Java 文件,运行 main 方法即可。需要依赖的 jar 包是 com.alibaba.fastjson(版本 1.2.28)和 jsoup(版本 1.10.2)。

如果使用 Maven(pom.xml),需要添加以下两个依赖:

?

1

2

3

4

5

6

7

8

9

10
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

完整的代码如下:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184
package com.tuniu.fcm.facade.ipproxy;

import com.alibaba.fastjson.JSONObject;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls public proxy-list pages and collects the proxies that pass a
 * connectivity check.
 *
 * <p>Requires com.alibaba.fastjson (JSONObject) and jsoup on the classpath.
 * The wanted count and the collected proxies are kept in {@link ThreadLocal}
 * fields, so each calling thread works on its own result list.
 */
public class proxycralwerunusedvpn {

    /**
     * Matches an "ip port" pair separated by a single space, e.g.
     * {@code 1.2.3.4 8080}. Compiled once because it never changes.
     */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    /** Number of proxies the current thread's caller asked for. */
    ThreadLocal<Integer> localwantednumber = new ThreadLocal<Integer>();

    /** Proxies collected so far by the current thread. */
    ThreadLocal<List<proxyinfo>> localproxyinfos = new ThreadLocal<List<proxyinfo>>();

    public static void main(String[] args) {
        proxycralwerunusedvpn proxycrawler = new proxycralwerunusedvpn();
        // Number of proxies to fetch, chosen by the caller.
        // (Asking for many proxies makes the call slower.)
        proxycrawler.startcrawler(1);
    }

    /**
     * Entry point exposed to other modules.
     *
     * <p>Crawls the configured list pages until {@code wantednumber} working
     * proxies have been found or every page has been visited, prints the
     * result as JSON to standard output and returns it.
     *
     * @param wantednumber number of proxies the caller would like to obtain
     * @return JSON text built from a {@code proxyresponse} whose data map has
     *         the keys {@code numfound}, {@code pagenum} and {@code proxy}
     */
    public String startcrawler(int wantednumber) {
        localwantednumber.set(wantednumber);
        // Start from an empty list: without this, a second call on the same
        // thread would see the previous call's proxies and return them again.
        localproxyinfos.set(new ArrayList<proxyinfo>());

        kuaidailicom("http://www.xicidaili.com/nn/", 15);
        kuaidailicom("http://www.xicidaili.com/nt/", 15);
        kuaidailicom("http://www.xicidaili.com/wt/", 15);
        kuaidailicom("http://www.kuaidaili.com/free/inha/", 15);
        kuaidailicom("http://www.kuaidaili.com/free/intr/", 15);
        kuaidailicom("http://www.kuaidaili.com/free/outtr/", 15);

        // Build the response payload.
        List<proxyinfo> found = currentproxies();
        proxyresponse response = new proxyresponse();
        response.setsuccess("true");
        Map<String, Object> datainfomap = new HashMap<String, Object>();
        datainfomap.put("numfound", found.size());
        datainfomap.put("pagenum", 1);
        datainfomap.put("proxy", found);
        response.setdata(datainfomap);

        String responsestring = JSONObject.toJSON(response).toString();
        System.out.println(responsestring);
        return responsestring;
    }

    /**
     * Crawls pages {@code 1..totalpage} (inclusive) below {@code baseurl},
     * extracts "ip port" pairs from the page text and keeps those that pass
     * {@link #checkproxy(String, Integer)}. Stops as soon as the wanted number
     * of proxies has been collected. A failure on one page is reported and the
     * crawl continues with the next page.
     *
     * @param baseurl   list URL ending in "/"; the page number and "/" are appended
     * @param totalpage number of pages to visit
     */
    private void kuaidailicom(String baseurl, int totalpage) {
        // "<=" so that the last of the totalpage pages is visited as well.
        for (int page = 1; page <= totalpage; page++) {
            if (getcurrentproxynumber() >= localwantednumber.get()) {
                return;
            }
            String pageurl = baseurl + page + "/";
            try {
                // No explicit "host" header: this method is also used for
                // xicidaili.com URLs, so a fixed kuaidaili.com host would be wrong;
                // the HTTP client derives the host from the request URL.
                Document doc = Jsoup.connect(pageurl)
                        .header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("accept-encoding", "gzip, deflate, sdch")
                        .header("accept-language", "zh-cn,zh;q=0.8,en;q=0.6")
                        .header("cache-control", "max-age=0")
                        .header("user-agent", "mozilla/5.0 (macintosh; intel mac os x 10_11_4) applewebkit/537.36 (khtml, like gecko) chrome/51.0.2704.103 safari/537.36")
                        .header("cookie", "hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=ga1.2.1061361785.1462812244")
                        // Referer follows the site actually being crawled.
                        .header("referer", baseurl)
                        .timeout(30 * 1000)
                        .get();

                Matcher m = IP_PORT_PATTERN.matcher(doc.text());
                while (m.find()) {
                    if (getcurrentproxynumber() >= localwantednumber.get()) {
                        break;
                    }
                    // The pattern guarantees exactly one space between ip and port.
                    String[] strs = m.group().split(" ");
                    if (checkproxy(strs[0], Integer.parseInt(strs[1]))) {
                        System.out.println("获取到可用代理ip\t" + strs[0] + "\t" + strs[1]);
                        addproxy(strs[0], strs[1], "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failing page and move on to the next one.
                System.err.println("Failed to crawl " + pageurl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Checks whether a proxy is usable by fetching a test page through it
     * with a 2 second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request completed, {@code false} if it threw
     */
    private static boolean checkproxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be replaced by any page that responds quickly.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception e) {
            // Any failure (timeout, refused connection, invalid port, ...) means "not usable".
            return false;
        }
    }

    /**
     * @return number of proxies collected so far by the current thread
     */
    private int getcurrentproxynumber() {
        return currentproxies().size();
    }

    /**
     * Appends a proxy to the current thread's result list.
     *
     * @param ip       proxy host
     * @param port     proxy port as text
     * @param protocol value stored as the proxy's type
     */
    private void addproxy(String ip, String port, String protocol) {
        currentproxies().add(new proxyinfo(ip, port, protocol));
    }

    /**
     * Returns the current thread's result list, creating and registering it on
     * first use so that entries added to a fresh list are never lost.
     */
    private List<proxyinfo> currentproxies() {
        List<proxyinfo> proxyinfos = localproxyinfos.get();
        if (proxyinfos == null) {
            proxyinfos = new ArrayList<proxyinfo>();
            localproxyinfos.set(proxyinfos);
        }
        return proxyinfos;
    }

}

/**
 * Plain data holder describing one proxy server.
 *
 * <p>NOTE(review): the accessor names are all lower-case (e.g. {@code getip})
 * rather than JavaBean style ({@code getIp}); bean-introspecting JSON
 * serializers may not recognise them as properties. Verify the serialized
 * output before relying on it.
 */
class proxyinfo {

    /** User name for the proxy; empty by default. */
    private String username = "";

    /** Proxy host. */
    private String ip;

    /** Password for the proxy; empty by default. */
    private String password = "";

    /** Proxy type as given to the constructor (presumably the protocol, e.g. "http"; confirm with callers). */
    private String type;

    /** Proxy port, kept as text. */
    private String port;

    /** Defaults to 1; meaning of the flag is not visible here. TODO confirm. */
    private int is_internet = 1;

    /**
     * @param ip   proxy host
     * @param port proxy port as text
     * @param type proxy type
     */
    public proxyinfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getusername() {
        return username;
    }

    public void setusername(String username) {
        this.username = username;
    }

    public String getip() {
        return ip;
    }

    public void setip(String ip) {
        this.ip = ip;
    }

    public String getpassword() {
        return password;
    }

    public void setpassword(String password) {
        this.password = password;
    }

    public String gettype() {
        return type;
    }

    public void settype(String type) {
        this.type = type;
    }

    public String getport() {
        return port;
    }

    public void setport(String port) {
        this.port = port;
    }

    public int getis_internet() {
        return is_internet;
    }

    public void setis_internet(int is_internet) {
        this.is_internet = is_internet;
    }

}

/**
 * Response envelope: a success flag kept as text plus a free-form data map.
 *
 * <p>NOTE(review): the accessor names are all lower-case (e.g.
 * {@code getsuccess}) rather than JavaBean style; bean-introspecting JSON
 * serializers may not recognise them as properties. Verify the serialized
 * output before relying on it.
 */
class proxyresponse {

    /** Success flag as text; {@code null} until set. */
    private String success;

    /** Payload keyed by name; {@code null} until set. */
    private Map<String, Object> data;

    public String getsuccess() {
        return success;
    }

    public void setsuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getdata() {
        return data;
    }

    public void setdata(Map<String, Object> data) {
        this.data = data;
    }

}

以上这篇java代理实现爬取代理ip的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持快网idc。

原文链接:https://blog.csdn.net/sdfiiiiii/article/details/70432060

收藏 (0) 打赏

感谢您的支持,我会继续努力的!

打开微信/支付宝扫一扫,即可进行扫码打赏哦,分享从这里开始,精彩与您同在
点赞 (0)

声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。

快网idc优惠网 建站教程 java代理实现爬取代理IP的示例 https://www.kuaiidc.com/111752.html

相关文章

发表评论
暂无评论