Linux C++ 模拟简易网络爬虫实例
发布时间:2020-12-16 05:09:30 所属栏目:百科 来源:网络整理
导读:废话不多说,直接上代码 /* To change this license header, choose License Headers in Project Properties. To change this template file, choose Tools | Templates and open the template in the editor. */ /* File: main.cpp  Author: yangchao */ #include …
// Without further ado, here is the code.

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/*
 * File:   main.cpp
 * Author: yangchao
 *
 */

#include <iostream>
#include <string>
#include <netdb.h>
#include <string.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

using namespace std;

/**
 * Splits a URL into its host part and its page path.
 *
 * A leading "http://" or "https://" scheme is removed. Everything from the
 * first '/' after the host onward becomes the page path; when there is no
 * such '/', the path defaults to "/".
 *
 * @param url       URL to split, e.g. "http://www.example.com/a/b.html".
 * @param hostUrl   Receives the host part, e.g. "www.example.com".
 * @param pagePath  Receives the path part, e.g. "/a/b.html".
 */
void parseHostAndPagePath(const string &url, string &hostUrl, string &pagePath) {
    hostUrl = url;
    pagePath = "/";

    // Strip the scheme only when it is a prefix, so that a URL embedded in
    // the query string (e.g. "?next=http://...") is left untouched.
    const string httpScheme("http://");
    const string httpsScheme("https://");
    if (hostUrl.compare(0, httpScheme.size(), httpScheme) == 0) {
        hostUrl.erase(0, httpScheme.size());
    } else if (hostUrl.compare(0, httpsScheme.size(), httpsScheme) == 0) {
        hostUrl.erase(0, httpsScheme.size());
    }

    const string::size_type slash = hostUrl.find('/');
    if (slash != string::npos) {
        pagePath = hostUrl.substr(slash);
        hostUrl.erase(slash);
    }
}

/**
 * Downloads a page over plain HTTP and returns the response body.
 *
 * The host is resolved with gethostbyname(), a TCP connection is opened to
 * port 80, a GET request is sent, the response header is skipped and the
 * remaining bytes are returned unmodified (no chunked-transfer or
 * content-encoding decoding is performed). The connection always uses port
 * 80 without TLS, even when the URL starts with "https://".
 *
 * On any resolver/socket failure a message is printed to stdout and the
 * process terminates with exit(1).
 *
 * @param url  URL of the page to fetch.
 * @return     Raw bytes received after the blank line that ends the header.
 */
string getPageContent(const string &url) {
    string hostUrl, pagePath;
    parseHostAndPagePath(url, hostUrl, pagePath);

    struct hostent *host = gethostbyname(hostUrl.c_str());
    if (host == NULL || host->h_addrtype != AF_INET || host->h_addr_list[0] == NULL) {
        cout << "gethostbyname error\n" << endl;
        exit(1);
    }

    const int port = 80;
    struct sockaddr_in pin;
    memset(&pin, 0, sizeof(pin));
    pin.sin_family = AF_INET;
    pin.sin_port = htons(port);
    // Copy the address bytes instead of casting, to avoid alignment/aliasing issues.
    memcpy(&pin.sin_addr, host->h_addr_list[0], sizeof(pin.sin_addr));

    const int isock = socket(AF_INET, SOCK_STREAM, 0);
    if (isock == -1) {
        cout << "open socket error\n" << endl;
        exit(1);
    }

    string requestHeader;
    requestHeader = "GET " + pagePath + " HTTP/1.1\r\n";
    requestHeader += "Host: " + hostUrl + "\r\n";
    requestHeader += "Accept: */*\r\n";
    requestHeader += "User-Agent: Mozilla/4.0(compatible)\r\n";
    // Ask the server to close the connection after the response so that the
    // read loop below ends at EOF instead of waiting for the receive timeout.
    requestHeader += "Connection: close\r\n";
    requestHeader += "\r\n";

    if (connect(isock, (const sockaddr *)&pin, sizeof(pin)) == -1) {
        cout << "connect error\n" << endl;
        close(isock);
        exit(1);
    }

    // send() may accept fewer bytes than requested; loop until all are written.
    string::size_type sent = 0;
    while (sent < requestHeader.size()) {
        const ssize_t written =
            send(isock, requestHeader.data() + sent, requestHeader.size() - sent, 0);
        if (written == -1) {
            cout << "send error\n" << endl;
            close(isock);
            exit(1);
        }
        sent += static_cast<string::size_type>(written);
    }

    // Safety net: never block longer than one second on a silent peer.
    struct timeval timeout = {1, 0};
    setsockopt(isock, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout));

    // Skip the response header byte by byte. The header ends at the first
    // empty line, i.e. a '\n' that follows a line with no other characters
    // ('\r' is ignored so both "\r\n" and "\n" line endings work).
    char c;
    bool lineHasContent = true;
    while (recv(isock, &c, 1, 0) > 0) {
        if (c == '\r') {
            continue;
        }
        if (c == '\n') {
            if (!lineHasContent) {
                break;
            }
            lineHasContent = false;
        } else {
            lineHasContent = true;
        }
    }

    // Read the body. Append by length so embedded NUL bytes are preserved.
    const int BUFFER_SIZE = 512;
    char buffer[BUFFER_SIZE];
    string pageContent;
    ssize_t len;
    while ((len = recv(isock, buffer, sizeof(buffer), 0)) > 0) {
        pageContent.append(buffer, static_cast<string::size_type>(len));
    }

    close(isock);
    return pageContent;
}

/**
 * Fetches the hao123 home page and prints its body to stdout.
 */
int main(int argc, char **argv) {
    cout << getPageContent("http://www.hao123.com") << endl;
    return 0;
}

/*
 * That is everything for this "simple web crawler in C++ on Linux" example.
 * We hope it serves as a useful reference, and we hope you will keep
 * supporting 编程小技巧 (Programming Tips).
 *
 * (Editor: 李大同)
 *
 * [Disclaimer] All content on this site is collected from the internet. The
 * views expressed are solely those of the respective authors and do not
 * represent the position of this site. If your rights have been
 * inadvertently infringed, please contact the site administrator promptly to
 * have the content removed.
 */