Implementing a Java crawler with Spring Boot + WebMagic, persisting via JDBC and MySQL


A while back I needed to scrape some information from web pages. I had no prior experience with crawlers, so I looked into WebMagic and wrote a simple one.

1. A quick introduction to WebMagic

WebMagic uses a fully modular design that covers the whole crawler life cycle (link extraction, page downloading, content extraction, persistence). It supports multi-threaded and distributed crawling, automatic retries, and custom UA/cookie settings.
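To make those knobs concrete, here is a minimal sketch of the Site-level configuration (WebMagic 0.7.x API; every value below is a placeholder of mine, not something from the original setup):

import us.codecraft.webmagic.Site;

public class SiteConfigSketch {
    // Each setter below maps to one of the features listed above.
    static final Site SITE = Site.me()
            .setUserAgent("Mozilla/5.0 (placeholder)") // custom UA
            .addCookie("JSESSIONID", "placeholder")    // custom cookie
            .setCharset("utf-8")                       // page encoding
            .setRetryTimes(3)                          // automatic retries on failure
            .setSleepTime(1000);                       // delay between requests, in ms
}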

Design concept: (the original post embeds an architecture diagram of WebMagic here)

Maven dependencies:

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
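A note on the exclusion: webmagic-extension 0.7.3 transitively pulls in the slf4j-log4j12 binding, while Spring Boot ships with Logback by default. Excluding it keeps a single SLF4J binding on the classpath and avoids the "multiple bindings" warning.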

The JDBC approach:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
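The DAO assumes the target table already exists. Here is a hypothetical one-time setup sketch, with the column names mirroring the INSERT above; the id primary key and all column types are my assumptions:

import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;

public class SchemaSetup {
    // Creates the table CsdnBlogDao writes to; types are assumptions.
    public static void createTable(Connection conn) throws SQLException {
        try (Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(
                "CREATE TABLE IF NOT EXISTS `test`.`csdnblog` ("
              + " `id` INT AUTO_INCREMENT PRIMARY KEY,"
              + " `keyes` INT, `titles` VARCHAR(255), `content` TEXT,"
              + " `dates` VARCHAR(64), `tags` VARCHAR(255), `category` VARCHAR(255),"
              + " `views` INT, `comments` INT, `copyright` INT"
              + ") DEFAULT CHARSET=utf8");
        }
    }
}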

Entity class:

public class CsdnBlog {

    private int key;         // post id
    private String title;    // title
    private String dates;    // date
    private String tags;     // tags
    private String category; // category
    private int view;        // view count
    private int comments;    // comment count
    private int copyright;   // original post or not
    private String content;  // body text

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
    public int getKey() { return key; }
    public void setKey(int key) { this.key = key; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getDates() { return dates; }
    public void setDates(String dates) { this.dates = dates; }
    public String getTags() { return tags; }
    public void setTags(String tags) { this.tags = tags; }
    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }
    public int getView() { return view; }
    public void setView(int view) { this.view = view; }
    public int getComments() { return comments; }
    public void setComments(int comments) { this.comments = comments; }
    public int getCopyright() { return copyright; }
    public void setCopyright(int copyright) { this.copyright = copyright; }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content
                + ", dates=" + dates + ", tags=" + tags + ", category=" + category
                + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";
    }
}

Startup class:

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN username to crawl
    private static int size = 0; // total number of articles crawled

    // site-wide crawl settings: encoding, crawl delay, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core extension point: the extraction logic goes here
    public void process(Page page) {
        // list page
        if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\\d+").match()) {
            // queue every article page
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict to the article list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // replace() trick: turn relative URLs into absolute ones
                    .all());
            // queue the remaining list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // same relative-to-absolute trick
                    .all());
        // article page
        } else {
            size++; // one more article
            // store the extracted data in a CsdnBlog object, ready for the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // id
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/(\\d+)").get()));
            // title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // body text
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // tags (possibly several, comma-separated)
            csdnBlog.setTags(listToString(page.getHtml()
                    .xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // categories (possibly several, comma-separated)
            csdnBlog.setCategory(listToString(page.getHtml()
                    .xpath("//div[@class='category_r']/label/span/text()").all()));
            // view count
            csdnBlog.setView(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_view']")
                    .regex("(\\d+)人阅读").get()));
            // comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']")
                    .regex("\\((\\d+)\\)").get()));
            // original post or not
            csdnBlog.setCopyright(page.getHtml().regex("blog_copyright").match() ? 1 : 0);
            // save the object to the database
            new CsdnBlogDao().add(csdnBlog);
            // echo the object to the console
            System.out.println(csdnBlog);
        }
    }

    // join a List into a comma-separated String
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[crawler started]...");
        startTime = System.currentTimeMillis();
        // start from the user's blog home page, with 5 threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; saved to the database.");
    }
}
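One design note: process() persists by instantiating a new CsdnBlogDao per article, which opens a fresh connection each time. WebMagic's idiomatic place for persistence is a Pipeline. A sketch of that refactoring (class and field names are mine, and it assumes process() calls page.putField("blog", csdnBlog) instead of saving directly):

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class CsdnBlogPipeline implements Pipeline {

    private final CsdnBlogDao dao = new CsdnBlogDao(); // one shared connection

    @Override
    public void process(ResultItems resultItems, Task task) {
        CsdnBlog blog = resultItems.get("blog"); // set via page.putField("blog", ...)
        if (blog != null) {
            dao.add(blog);
        }
    }
}

It would be registered with Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline()).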

The MySQL approach (Spring-managed services + MyBatis):

import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ConfigurableApplicationContext;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

@SpringBootApplication // assumed: not shown in the original snippet, but SpringApplication.run needs it
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // site-wide crawl settings: encoding, crawl delay, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core extension point: the extraction logic goes here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) { // "网址" is the author's placeholder for the target URL
            DianjingVideo dv = new DianjingVideo();
            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all(); // href of each <a> tag
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // photos
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
            // note: the fixed bound of 5 assumes the page yields at least five results
            for (int i = 0; i < 5; i++) {
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
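Note the ordering in main(): the Spring context has to come up first so that the static service reference d is populated via getBean before any crawling starts; presumably that is why the Spider.create(...) line is left commented out, to be enabled only after the context is initialized.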

Controller:

import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

import com.alibaba.fastjson.JSONObject; // assumption: fastjson's JSONObject

@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /*
     * mobile games
     */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
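With rows in the table, a GET to /dianjing/dianjing would return a body shaped roughly like {"code":0,"success":true,"count":2,"list":[...]} (values illustrative).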

The entity class is omitted here.

DAO layer:

@insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")

int adddj(dianjing dj);
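For context, that annotated method would live in a MyBatis mapper interface. A sketch, under the assumption that the project uses the MyBatis Spring Boot starter (the interface name is mine):

import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;

@Mapper // registered by MyBatis' Spring Boot starter via classpath scanning
public interface DianjingDao {
    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
          + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDj(Dianjing dj);
}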

That is everything I wanted to share about implementing a Java crawler with Spring Boot + WebMagic over JDBC and MySQL; I hope it gives you a useful reference.

Original post: https://www.cnblogs.com/NCL–/p/8608336.html
