A while ago I needed to scrape some information from web pages. I had no prior experience with crawlers, so I looked into WebMagic and wrote a simple crawler with it.
1. A quick introduction to WebMagic:
WebMagic has a fully modular design that covers the whole crawler lifecycle: link extraction, page downloading, content extraction, and persistence. It supports multi-threaded and distributed crawling, automatic retries, and custom UA/cookie settings.
Design philosophy: a Spider orchestrates four pluggable components — Downloader, PageProcessor, Scheduler, and Pipeline — so each stage of the lifecycle can be customized or replaced independently.
Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
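As a quick orientation before the real example, here is a minimal runnable sketch of how the pieces fit together. The class name, the selectors, and the example.com URL are illustrative placeholders of my own, not from the original post:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

// Minimal skeleton: Spider drives the lifecycle, PageProcessor holds the extraction logic.
public class MinimalPageProcessor implements PageProcessor {

    // Crawl settings shared by every request of this spider
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        // Extract one field and queue every outgoing link (both selectors are placeholders)
        page.putField("title", page.getHtml().xpath("//title/text()").get());
        page.addTargetRequests(page.getHtml().links().all());
    }

    public static void main(String[] args) {
        // With no Pipeline configured, extracted fields go to the default ConsolePipeline
        Spider.create(new MinimalPageProcessor()).addUrl("http://example.com").thread(1).run();
    }
}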
JDBC mode:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {
    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***3&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    // Insert one crawled blog record; returns the affected row count, or -1 on failure
    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
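One thing worth noting about this DAO: the PreparedStatement is never closed, and the shared Connection lives for the life of the object. A sketch of how I would write add() with try-with-resources (same table, same columns; this variant is my suggestion, not from the original post) — it drops into the same class:

// Sketch only: the same insert, letting try-with-resources close the statement.
public int add(CsdnBlog csdnBlog) {
    String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?)";
    try (PreparedStatement ps = conn.prepareStatement(sql)) {
        ps.setInt(1, csdnBlog.getKey());
        ps.setString(2, csdnBlog.getTitle());
        ps.setString(3, csdnBlog.getContent());
        ps.setString(4, csdnBlog.getDates());
        ps.setString(5, csdnBlog.getTags());
        ps.setString(6, csdnBlog.getCategory());
        ps.setInt(7, csdnBlog.getView());
        ps.setInt(8, csdnBlog.getComments());
        ps.setInt(9, csdnBlog.getCopyright());
        return ps.executeUpdate(); // the statement is closed automatically, even on error
    } catch (SQLException e) {
        e.printStackTrace();
        return -1;
    }
}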
Entity class:
public class CsdnBlog {
    private int key;          // article ID
    private String title;     // title
    private String dates;     // date
    private String tags;      // tags
    private String category;  // category
    private int view;         // view count
    private int comments;     // comment count
    private int copyright;    // 1 if original content, 0 otherwise
    private String content;   // body text

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
    public int getKey() { return key; }
    public void setKey(int key) { this.key = key; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getDates() { return dates; }
    public void setDates(String dates) { this.dates = dates; }
    public String getTags() { return tags; }
    public void setTags(String tags) { this.tags = tags; }
    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }
    public int getView() { return view; }
    public void setView(int view) { this.view = view; }
    public int getComments() { return comments; }
    public void setComments(int comments) { this.comments = comments; }
    public int getCopyright() { return copyright; }
    public void setCopyright(int copyright) { this.copyright = copyright; }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content
                + ", dates=" + dates + ", tags=" + tags + ", category=" + category
                + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";
    }
}
Launcher class:
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN user name to crawl
    private static int size = 0;                       // total number of articles crawled

    // Site-level crawl settings: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core extension point of the crawler: all extraction logic goes here
    public void process(Page page) {
        // List page
        if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\\d+").match()) {
            // Queue every article page
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict extraction to the article-list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // use replace to turn relative URLs into absolute ones
                    .all());
            // Queue the other list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict extraction to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")
                    .all());
        // Article page
        } else {
            size++; // one more article
            // Collect the extracted fields into a CsdnBlog so they can be saved to the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // ID
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/(\\d+)").get()));
            // Title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // Body text
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // Date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // Tags (there may be several; joined with commas)
            csdnBlog.setTags(listToString(page.getHtml().xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // Categories (there may be several; joined with commas)
            csdnBlog.setCategory(listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
            // View count
            csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                    .regex("(\\d+)人阅读").get()));
            // Comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
            // Original-content flag
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
            // Persist to the database
            new CsdnBlogDao().add(csdnBlog);
            // Echo to the console
            System.out.println(csdnBlog);
        }
    }

    // Join a list into a comma-separated string
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[Crawler started]...");
        startTime = System.currentTimeMillis();
        // Start from the user's blog home page with 5 worker threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[Crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; saved to the database.");
    }
}
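One caveat worth flagging here: with thread(5), WebMagic calls process() from several worker threads at once, so the plain static size++ above is a data race and the final count can come up short. A minimal fix, assuming nothing else changes, is java.util.concurrent.atomic.AtomicInteger; a sketch of the lines that would change:

import java.util.concurrent.atomic.AtomicInteger;

// field: replace "private static int size = 0;" with an atomic counter
private static final AtomicInteger size = new AtomicInteger(0);

// in process(), replace "size++;" with:
size.incrementAndGet();

// in main(), read the total with size.get() when printing the summary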
Using MySQL (the Spring Boot variant):
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.SpringApplication;
import org.springframework.context.ConfigurableApplicationContext;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // Site-level crawl settings: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // Boot the Spring context first so the crawler can use Spring-managed services
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core extension point of the crawler: all extraction logic goes here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) {
            DianjingVideo dv = new DianjingVideo();
            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs of the <a> tags
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // thumbnail images
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
            for (int i = 0; i < 5; i++) {
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
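Note that the loop above reuses a single DianjingVideo instance for all five rows; whether that is safe depends on whether addVideo() copies the fields before the next iteration overwrites them. The safer pattern, assuming the same setters, is to allocate one object per row:

for (int i = 0; i < 5; i++) {
    DianjingVideo dv = new DianjingVideo(); // fresh object per row, so later rows can't clobber earlier ones
    dv.setTitles(ls.get(i));
    dv.setCategory("");
    dv.setDates(ls2.get(i));
    dv.setHrefs(ls1.get(i));
    dv.setPhoto(ls3.get(i));
    dv.setSources("");
    d.addVideo(dv);
}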
Controller:
@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /*
     * Mobile games
     */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
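For reference, a successful call to GET /dianjing/dianjing would then return JSON shaped roughly like this (the values are invented for illustration):

{"code": 0, "success": true, "count": 2, "list": [ {...}, {...} ]}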
The entity class is omitted here.
DAO layer:
@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
int addDj(Dianjing dj);
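In context, that annotation sits on a MyBatis mapper interface. A minimal sketch of the surrounding declaration — the interface name and the @Mapper registration are my assumptions, the entity being the Dianjing class used by the controller above:

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper
public interface DianjingMapper {

    // Column list matches the dianjing table used above; #{...} binds Dianjing getters by name
    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDj(Dianjing dj);
}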
That is everything in this walkthrough of building a Java crawler with Spring Boot and WebMagic, persisting through JDBC and MySQL. I hope it serves as a useful reference.
Original article: https://www.cnblogs.com/NCL–/p/8608336.html