linux c++模拟简易网络爬虫实例

2025-05-27 0 42

废话不多说,直接上代码

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102
/*

* To change this license header, choose License Headers in Project Properties.

* To change this template file, choose Tools | Templates

* and open the template in the editor.

*/

/*

* File: main.cpp

* Author: yangchao

*

*/

#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <iostream>
#include <string>

using namespace std;

/**
 * Splits a URL into its host part and its page path.
 *
 * A leading "http://" or "https://" scheme is removed, then the remainder is
 * cut at the first '/': the text before the slash becomes the host and the
 * text from the slash onwards becomes the path. When there is no slash the
 * path defaults to "/".
 *
 * Only a scheme at the very start of the URL is stripped, so a URL embedded
 * later in the path or query string (e.g. "a.com/r?u=http://b.com") is left
 * untouched.
 *
 * @param url       URL to split, with or without a scheme.
 * @param hostUrl   Receives everything before the first '/' (a ":port"
 *                  suffix, if present, is kept as part of the host).
 * @param pagePath  Receives the path; it always starts with '/'.
 */
void parseHostAndPagePath(const std::string &url, std::string &hostUrl, std::string &pagePath) {
    static const char *const kSchemes[] = {"http://", "https://"};

    // Work on a private copy so the outputs may safely alias the input.
    std::string rest = url;
    for (std::string::size_type i = 0; i < sizeof(kSchemes) / sizeof(kSchemes[0]); ++i) {
        const std::string scheme(kSchemes[i]);
        if (rest.compare(0, scheme.size(), scheme) == 0) {
            rest.erase(0, scheme.size());
            break;
        }
    }

    const std::string::size_type slash = rest.find('/');
    if (slash == std::string::npos) {
        hostUrl = rest;
        pagePath = "/";
    } else {
        hostUrl = rest.substr(0, slash);
        pagePath = rest.substr(slash);
    }
}

string getPageContent(const string url){

struct hostent *host;

string hostUrl,pagePath;

parseHostAndPagePath(url,hostUrl,pagePath);

if(0==(host=gethostbyname(hostUrl.c_str())))

{

cout<<"gethostbyname error\\n"<<endl;

exit(1);

}

struct sockaddr_in pin;

int port=80;

bzero(&pin,sizeof(pin));

pin.sin_family=AF_INET;

pin.sin_port=htons(port);

pin.sin_addr.s_addr=((struct in_addr*)(host->h_addr))->s_addr;

int isock;

if((isock=socket(AF_INET,SOCK_STREAM,0))==-1)

{

cout<<"open socket error\\n"<<endl;

exit(1);

}

string requestHeader;

requestHeader="GET "+pagePath+" HTTP/1.1\\r\\n";

requestHeader+="Host: "+hostUrl+"\\r\\n";

requestHeader+="Accept: */*\\r\\n";

requestHeader+="User-Agent: Mozilla/4.0(compatible)\\r\\n";

requestHeader+="connection:Keep-Alive\\r\\n";

requestHeader+="\\r\\n";

if(connect(isock,(const sockaddr*)&pin,sizeof(pin))==-1){

cout<<"connect error\\n"<<endl;

exit(1);

}

if(send(isock,requestHeader.c_str(),requestHeader.size(),0)==-1){

cout<<"send error\\n"<<endl;

exit(1);

}

struct timeval timeout={1,0};

setsockopt(isock,SOL_SOCKET,SO_RCVTIMEO,(char*)&timeout,sizeof(struct timeval));

char c;

bool flag=true;

while(recv(isock,&c,1,0)>0){

if('\\r'==c){

continue;

}else if('\\n'==c){

if(false==flag)

break;

flag=false;

}else{

flag=true;

}

}

int len,BUFFER_SIZE=512;

char buffer[BUFFER_SIZE];

string pageContent="";

while((len=recv(isock,buffer,BUFFER_SIZE-1,0))>0){

buffer[len]='\\0';

pageContent+=buffer;

}

return pageContent;

}

/**
 * Program entry point: fetches one page and writes its body to stdout.
 *
 * The URL is taken from the first command-line argument when one is given;
 * otherwise "http://www.hao123.com" is fetched.
 *
 * @param argc  Number of command-line arguments, including the program name.
 * @param argv  Command-line arguments; argv[1], if present, is the URL.
 * @return      0 after the page content has been printed.
 */
int main(int argc, char** argv) {
    const std::string url = (argc > 1) ? argv[1] : "http://www.hao123.com";
    std::cout << getPageContent(url) << std::endl;
    return 0;
}

以上这篇linux c++模拟简易网络爬虫实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持快网idc。

收藏 (0) 打赏

感谢您的支持,我会继续努力的!

打开微信/支付宝扫一扫,即可进行扫码打赏哦,分享从这里开始,精彩与您同在
点赞 (0)

声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。

快网idc优惠网 建站教程 linux c++模拟简易网络爬虫实例 https://www.kuaiidc.com/73418.html

相关文章

发表评论
暂无评论