龙空技术网

C实现网页信息捕获

爱音乐的程序员小新人 123

前言:

目前大家对“c语言读取网页内容”都比较关注,不少读者也想阅读一些“c语言读取网页内容”的相关文章。为此小编在网上整理了一些有关“c语言读取网页内容”的相关内容,希望大家能喜欢,快来了解一下吧!

经常听别人说通过Python可以轻松实现一个小爬虫,甚至会有部分同学一听到爬虫就认为是Python,我竟无言以对。为了先从自己开始打破这种思想禁锢,花了一天时间查找资料了解网络方面知识,用C程序接收数据。程序框架借鉴于他人。做出来的只是Demo,欢迎批评指正。

/******************************************************************
 * Copyright (C) 2018 XZD. All rights reserved.
 *
 * File name:   networm.c
 * Author:      xiaozude
 * Version:     1.0.0
 * Date:        2018-07-21
 * Description: Build with gcc on Linux, then run as
 *              "<program> <url>". The page is fetched over plain
 *              HTTP (port PORT) and the response body is written
 *              to stdout; diagnostics go to stderr.
 ******************************************************************/

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_STR_LEN 1024
#define PORT 80

/* Capacity (including the terminating NUL) that callers must provide for
 * the host and path buffers; main() allocates exactly this much. */
#define MAX_URL_LEN (MAX_STR_LEN / 2)

/*
 * Split a URL into its host part and its page path.
 *
 * A leading "https://" or "http://" is stripped (the scheme is not
 * otherwise honoured: the program always speaks plain HTTP on PORT).
 * Everything up to the first '/' becomes the host; the rest, including
 * that '/', becomes the path. A URL without a path yields "/".
 *
 * url      - NUL-terminated URL to split.
 * hostUrl  - output buffer of at least MAX_URL_LEN bytes.
 * pagePath - output buffer of at least MAX_URL_LEN bytes.
 *
 * Returns true on success; false if an argument is NULL, the host is
 * empty, or either part would not fit in MAX_URL_LEN bytes. The output
 * buffers are left untouched on failure.
 */
_Bool GetUrlAndPath(const char *url, char *hostUrl, char *pagePath)
{
    if (NULL == url || NULL == hostUrl || NULL == pagePath) {
        return false;
    }

    /* Skip the scheme prefix, if any. */
    const char *start = url;
    if (0 == strncmp(start, "https://", 8)) {
        start += 8;
    } else if (0 == strncmp(start, "http://", 7)) {
        start += 7;
    }

    const char *slash = strchr(start, '/');
    const char *path = (NULL != slash) ? slash : "/";
    size_t hostLen = (NULL != slash) ? (size_t)(slash - start) : strlen(start);

    /* Validate lengths before writing so the fixed-size buffers cannot overflow. */
    if (0 == hostLen || hostLen >= MAX_URL_LEN || strlen(path) >= MAX_URL_LEN) {
        return false;
    }

    memcpy(hostUrl, start, hostLen);
    hostUrl[hostLen] = '\0';
    strcpy(pagePath, path); /* length checked above */

    return true;
}

/*
 * Build the HTTP/1.1 GET request for the given host and path.
 *
 * hostUrl       - host name placed in the Host header.
 * pagePath      - request target (starts with '/').
 * requestHeader - output buffer of at least MAX_STR_LEN bytes.
 *
 * The request asks the server to close the connection after the
 * response, because GetPageContent() reads until end-of-stream.
 *
 * Returns true on success; false if an argument is NULL or the request
 * does not fit in MAX_STR_LEN bytes.
 */
_Bool GetRequestHeader(const char *hostUrl, const char *pagePath, char *requestHeader)
{
    if (NULL == hostUrl || NULL == pagePath || NULL == requestHeader) {
        return false;
    }

    int written = snprintf(requestHeader, MAX_STR_LEN,
                           "GET %s HTTP/1.1\r\n"
                           "Host: %s\r\n"
                           "Accept: */*\r\n"
                           "User-Agent: Mozilla/4.0(compatible)\r\n"
                           "Connection: close\r\n"
                           "\r\n",
                           pagePath, hostUrl);

    /* snprintf returns the untruncated length; >= capacity means truncation. */
    return written > 0 && written < MAX_STR_LEN;
}

/*
 * Resolve a host name with gethostbyname().
 *
 * hostUrl - host name to look up.
 * host    - receives the resolver's result (storage owned by the resolver).
 *
 * Returns true on success; on failure prints a message to stderr,
 * leaves *host NULL and returns false.
 */
_Bool GetHostByName(const char *hostUrl, struct hostent **host)
{
    if (NULL == hostUrl || NULL == host) {
        return false;
    }

    *host = gethostbyname(hostUrl);
    if (NULL == *host) {
        fprintf(stderr, "gethostbyname error\n");
        return false;
    }

    return true;
}

/*
 * Fill an IPv4 socket address from a resolved host entry, using the
 * first address in the entry and port PORT.
 *
 * Returns true on success; false (with a message on stderr) if an
 * argument is NULL or the entry holds no IPv4 address.
 */
_Bool GetSockaddr(const struct hostent *host, struct sockaddr_in *addr)
{
    if (NULL == host || NULL == addr) {
        return false;
    }

    if (AF_INET != host->h_addrtype
        || NULL == host->h_addr_list
        || NULL == host->h_addr_list[0]
        || host->h_length < (int)sizeof(addr->sin_addr)) {
        fprintf(stderr, "no IPv4 address for host\n");
        return false;
    }

    memset(addr, 0, sizeof(*addr));
    addr->sin_family = AF_INET;
    addr->sin_port = htons(PORT);
    /* memcpy avoids reading the address through a possibly misaligned pointer. */
    memcpy(&addr->sin_addr, host->h_addr_list[0], sizeof(addr->sin_addr));

    return true;
}

/*
 * Create a TCP (SOCK_STREAM) IPv4 socket.
 *
 * sfd - receives the new descriptor, or -1 on failure.
 *
 * Returns true on success; false (with a message on stderr) otherwise.
 */
_Bool GetSocketFile(int *sfd)
{
    if (NULL == sfd) {
        return false;
    }

    *sfd = socket(AF_INET, SOCK_STREAM, 0);
    if (-1 == *sfd) {
        fprintf(stderr, "socket error\n");
        return false;
    }

    return true;
}

/*
 * Set a one-second receive timeout on the socket so that recv() cannot
 * block indefinitely.
 *
 * Returns true on success; false (with a message on stderr) if
 * setsockopt() fails.
 */
_Bool SetSockopt(int sfd)
{
    struct timeval timeout = {1, 0};

    if (-1 == setsockopt(sfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout))) {
        fprintf(stderr, "setsockopt error\n");
        return false;
    }

    return true;
}

/*
 * Connect, send the request, skip the response headers and copy the
 * response body to stdout.
 *
 * sfd           - socket created by GetSocketFile().
 * addr          - server address to connect to.
 * host          - unused; kept for interface compatibility.
 * requestHeader - NUL-terminated request built by GetRequestHeader().
 *
 * Returns true once the body has been read to end-of-stream (or until
 * recv() stops returning data); false (with a message on stderr) if
 * connecting or sending fails, or the stream ends before the blank
 * line that terminates the headers.
 */
_Bool GetPageContent(int sfd, struct sockaddr_in addr, const struct hostent *host, const char *requestHeader)
{
    (void)host;

    if (NULL == requestHeader) {
        return false;
    }

    /* Establish the TCP connection. */
    if (-1 == connect(sfd, (struct sockaddr *)&addr, sizeof(addr))) {
        fprintf(stderr, "connect error\n");
        return false;
    }

    /* Send the whole request; send() may accept fewer bytes than asked. */
    const char *cursor = requestHeader;
    size_t remaining = strlen(requestHeader);
    while (remaining > 0) {
        ssize_t sent = send(sfd, cursor, remaining, 0);
        if (sent <= 0) {
            fprintf(stderr, "send error\n");
            return false;
        }
        cursor += sent;
        remaining -= (size_t)sent;
    }

    /*
     * Skip the response headers byte by byte. '\r' is ignored; a '\n'
     * seen while the current line has no other content is the blank
     * line that ends the headers. The flag starts true so that the
     * status line is treated as a line with content.
     */
    _Bool lineHasContent = true;
    _Bool headersDone = false;
    char ch = '\0';
    while (recv(sfd, &ch, 1, 0) > 0) { /* stop on EOF (0) and on error/timeout (-1) */
        if ('\r' == ch) {
            continue;
        }
        if ('\n' != ch) {
            lineHasContent = true;
            continue;
        }
        if (!lineHasContent) {
            headersDone = true;
            break;
        }
        lineHasContent = false;
    }
    if (!headersDone) {
        fprintf(stderr, "recv error\n");
        return false;
    }

    /* Copy the body verbatim; fwrite keeps chunk boundaries and NUL bytes intact. */
    char buffer[MAX_STR_LEN];
    ssize_t len = 0;
    while ((len = recv(sfd, buffer, sizeof(buffer), 0)) > 0) {
        fwrite(buffer, 1, (size_t)len, stdout);
    }
    fflush(stdout);

    return true;
}

/*
 * Usage: <program> <url>
 *
 * Exit status is 0 when the page body was fetched, 1 on any failure.
 */
int main(int argc, char *argv[])
{
    char hostUrl[MAX_URL_LEN] = "";
    char pagePath[MAX_URL_LEN] = "";
    char requestHeader[MAX_STR_LEN] = "";
    struct hostent *host = NULL;
    struct sockaddr_in addr;
    int sfd = -1;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <url>\n", (argc > 0 && NULL != argv[0]) ? argv[0] : "networm");
        return 1;
    }

    if (!GetUrlAndPath(argv[1], hostUrl, pagePath)) {
        fprintf(stderr, "invalid or too long url\n");
        return 1;
    }

    if (!GetRequestHeader(hostUrl, pagePath, requestHeader)) {
        fprintf(stderr, "request too long\n");
        return 1;
    }

    /* Each helper reports its own error on stderr. */
    if (!GetHostByName(hostUrl, &host)
        || !GetSockaddr(host, &addr)
        || !GetSocketFile(&sfd)) {
        return 1;
    }

    /* The timeout is best-effort: carry on without it if it cannot be set. */
    (void)SetSockopt(sfd);

    int status = GetPageContent(sfd, addr, host, requestHeader) ? 0 : 1;

    close(sfd);
    return status;
}

标签: #c语言读取网页内容 #c获取html内容 #网页获取flag