如何优雅地使用c语言编写爬虫_软件运维

前言

大家在平时或多或少地都会有编写网络爬虫的需求。一般来说，编写爬虫的首选自然非python莫属，除此之外，java等语言也是不错的选择。选择上述语言的原因不仅仅在于它们均有非常不错的网络请求库和字符串处理库，还在于基于上述语言的爬虫框架非常之多和完善。良好的爬虫框架可以确保爬虫程序的稳定性，以及编写程序的便捷性。所以，这个cspider爬虫库的使命在于，我们能够使用c语言，依然能够优雅地编写爬虫程序。

爬虫的特性

配置方便。使用一句设置函数，即可定义user agent，cookie，timeout，proxy以及抓取线程和解析线程的最大数量。

程序逻辑独立。用户可以分别定义爬虫的解析函数，和数据持久化函数。并且对于解析到的新url，用户可以使用cspider提供的addUrl函数，将其加入到任务队列中。

便捷的字符串处理。cspider中提供了基于pcre的简单的正则表达式函数，基于libxml2的xpath解析函数，以及用于解析json的cJSON库。

高效的抓取。cspider基于libuv调度抓取线程和解析线程，使用curl作为其网络请求库。

使用cspider的步骤

获取cspider_t。

自定义user agent，cookie，timeout，proxy以及抓取线程和解析线程的最大数量。

添加初始要抓取的url到任务队列。

编写解析函数和数据持久化函数。

启动爬虫。

例子

先来看下简单的爬虫例子，会在后面详细讲解例子。

#include<cspider/spider.h>

自定义的解析函数，d为获取到的html页面字符串

/*
 * Parse callback handed to cs_setopt_process.
 *
 * Runs an XPath query over the downloaded page and passes every match to
 * saveString(), which forwards it to the persistence callback.
 *
 * cspider   - crawler handle, forwarded to saveString()
 * d         - HTML text of the fetched page
 * user_data - user context pointer (not used by this example)
 */
void p(cspider_t *cspider, char *d, void *user_data) {
  char *get[100];

  // Query the HTML with XPath; `get` receives the matches (capacity 100).
  int size = xpath(d, "//body/div[@class='wrap']/div[@class='sort-column area']/div[@class='column-bd cfix']/ul[@class='st-list cfix']/li/strong/a", get, 100);

  int i;
  for (i = 0; i < size; i++) {
    // Persist each extracted movie title.
    saveString(cspider, get[i]);
  }
}

数据持久化函数，对上面解析函数中调用的saveString()函数传入的数据，进行进一步的保存

void s(void *str, void *user_data) {

char *get = (char *)str

FILE *file = (FILE*)user_data

fprintf(file, "%s\n", get)

return

}

int main() {

//初始化spider

cspider_t *spider = init_cspider()

char *agent = "Mozilla/5.0 (MacintoshIntel Mac OS X 10.10rv:42.0) Gecko/20100101 Firefox/42.0"

//char *cookie = "bid=s3/yuH5Jd/Ill=108288viewed=1130500_24708145_6433169_4843567_1767120_5318823_1899158_1271597__utma=30149280.927537245.1446813674.1446983217.1449139583.4__utmz=30149280.1449139583.4.4.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/loginps=yue=965166527@qq.comdbcl2=58742090:QgZ2PSLiDLQck=T9Wnpush_noty_num=0push_doumail_num=7ap=1__utmb=30149280.0.10.1449139583__utmc=30149280"

//设置要抓取页面的url

cs_setopt_url(spider, "so.tv.sohu.com/list_p1100_p20_p3_u5185_u5730_p40_p5_p6_p77_p80_p9_2d1_p101_p11.html")

//设置user agent

cs_setopt_useragent(spider, agent)

//cs_setopt_cookie(spider, cookie)

//传入解析函数和数据持久化函数的指针

cs_setopt_process(spider, p, NULL)

//s函数的user_data指针指向stdout

cs_setopt_save(spider, s, stdout)

//设置线程数量

cs_setopt_threadnum(spider, DOWNLOAD, 2)

cs_setopt_threadnum(spider, SAVE, 2)

//FILE *fp = fopen("log", "wb+")

//cs_setopt_logfile(spider, fp)

//开始爬虫

return cs_run(spider)

}

例子讲解

cspider_t *spider = init_cspider()获取初始的cspider。cs_setopt_xxx这类函数可以用来进行初始化设置。其中要注意的是: cs_setopt_process(spider,p,NULL)与cs_setopt_save(spider,s,stdout)，它们分别设置了解析函数p和数据持久化函数s，这两个函数需要用户自己实现，还有用户自定义的指向上下文信息user_data的指针。

在解析函数中，用户要定义解析的规则，并可以对解析得到的字符串调用saveString进行持久化，或者调用addUrl将url加入到任务队列中。在saveString中传入的字符串会在用户自定义的数据持久化函数中得到处理。此时，用户可以选择输出到文件或数据库等。

最后调用cs_run(spider)即可启动爬虫。

具体的API参数可在这里查看

总结

赶快使用cspider爬虫框架来编写爬虫吧！如果在使用过程中发现bug，欢迎反馈。

望采纳，谢谢

获取cspider_t。

自定义user agent，cookie，timeout，proxy以及抓取线程和解析线程的最大数量。

添加初始要抓取的url到任务队列。

编写解析函数和数据持久化函数。

启动爬虫。

例子

先来看下简单的爬虫例子，会在后面详细讲解例子。

#include<cspider/spider.h>

自定义的解析函数，d为获取到的html页面字符串

/*
 * Parse callback handed to cs_setopt_process.
 *
 * Runs an XPath query over the downloaded page and passes every match to
 * saveString(), which forwards it to the persistence callback.
 *
 * cspider   - crawler handle, forwarded to saveString()
 * d         - HTML text of the fetched page
 * user_data - user context pointer (not used by this example)
 */
void p(cspider_t *cspider, char *d, void *user_data) {
  char *get[100];

  // Query the HTML with XPath; `get` receives the matches (capacity 100).
  int size = xpath(d, "//body/div[@class='wrap']/div[@class='sort-column area']/div[@class='column-bd cfix']/ul[@class='st-list cfix']/li/strong/a", get, 100);

  int i;
  for (i = 0; i < size; i++) {
    // Persist each extracted movie title.
    saveString(cspider, get[i]);
  }
}

数据持久化函数，对上面解析函数中调用的saveString()函数传入的数据，进行进一步的保存

void s(void *str, void *user_data) {

char *get = (char *)str

FILE *file = (FILE*)user_data

fprintf(file, "%s\n", get)

return

}

int main() {

//初始化spider

cspider_t *spider = init_cspider()

char *agent = "Mozilla/5.0 (MacintoshIntel Mac OS X 10.10rv:42.0) Gecko/20100101 Firefox/42.0"

//设置要抓取页面的url

cs_setopt_url(spider, "so.tv.sohu.com/list_p1100_p20_p3_u5185_u5730_p40_p5_p6_p77_p80_p9_2d1_p101_p11.html")

//设置user agent

cs_setopt_useragent(spider, agent)

//cs_setopt_cookie(spider, cookie)

//传入解析函数和数据持久化函数的指针

cs_setopt_process(spider, p, NULL)

//s函数的user_data指针指向stdout

cs_setopt_save(spider, s, stdout)

//设置线程数量

cs_setopt_threadnum(spider, DOWNLOAD, 2)

cs_setopt_threadnum(spider, SAVE, 2)

//FILE *fp = fopen("log", "wb+")

//cs_setopt_logfile(spider, fp)

//开始爬虫

return cs_run(spider)

}

要想爬，先读取HTTP

读取后进行字符串分割

Http访问有两种方式：GET和POST。就编程来说，GET方式相对简单一些，因为它不用向服务器提交数据；本程序中使用的是POST方式，即向服务器提交数据并从服务器获取返回值。

为实现Http访问，微软提供了两套API：WinINet和WinHTTP。WinHTTP比WinINet更加安全和健壮，可以认为WinHTTP是WinINet的升级版本。

程序中，通过一个宏的设置来决定是使用WinHttp还是WinINet。

#define USE_WINHTTP // Comment this line out to use WinINet instead of WinHTTP.

下面来说说实现Http访问的流程（两套API都一样的流程）：

1，首先我们打开一个Session获得一个HINTERNET session句柄；

2，然后我们使用这个session句柄与服务器连接得到一个HINTERNET connect句柄；

3，然后我们使用这个connect句柄来打开Http 请求得到一个HINTERNET request句柄；

4，这时我们就可以使用这个request句柄来发送数据与读取从服务器返回的数据；

5，最后依次关闭request，connect，session句柄。

/***********************定义HTTP发送所用方法***********************************/

// Opens a session handle: WinHttpOpen when USE_WINHTTP is defined,
// InternetOpen otherwise.
//   userAgent - user-agent string passed to the underlying API (may be 0).
// Returns the handle produced by the underlying call.
HINTERNET OpenSession(LPCWSTR userAgent = 0)
{
#ifdef USE_WINHTTP
    return WinHttpOpen(userAgent, NULL, NULL, NULL, NULL);
#else
    return InternetOpen(userAgent, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
#endif
}

// Opens a connection handle on an existing session.
//   hSession   - handle returned by OpenSession
//   serverAddr - host name to connect to
//   portNo     - server port
// Returns the handle produced by WinHttpConnect / InternetConnect.
HINTERNET Connect(HINTERNET hSession, LPCWSTR serverAddr, int portNo)
{
#ifdef USE_WINHTTP
    return WinHttpConnect(hSession, serverAddr, (INTERNET_PORT) portNo, 0);
#else
    return InternetConnect(hSession, serverAddr, portNo, NULL, NULL, INTERNET_SERVICE_HTTP, 0, 0);
#endif
}

// Creates a request handle on an existing connection.
//   hConnect   - handle returned by Connect
//   verb       - HTTP verb, e.g. L"POST"
//   objectName - request path
//   scheme     - URL scheme; INTERNET_SCHEME_HTTPS adds the API's "secure" flag
// Returns the handle produced by WinHttpOpenRequest / HttpOpenRequest.
HINTERNET OpenRequest(HINTERNET hConnect, LPCWSTR verb, LPCWSTR objectName, int scheme)
{
    DWORD flags = 0;
#ifdef USE_WINHTTP
    if (scheme == INTERNET_SCHEME_HTTPS) {
        flags |= WINHTTP_FLAG_SECURE;
    }
    return WinHttpOpenRequest(hConnect, verb, objectName, NULL, NULL, NULL, flags);
#else
    if (scheme == INTERNET_SCHEME_HTTPS) {
        flags |= INTERNET_FLAG_SECURE;
    }
    return HttpOpenRequest(hConnect, verb, objectName, NULL, NULL, NULL, flags, 0);
#endif
}

// Adds a header string to a request using the API's "add" flag.
//   hRequest - handle returned by OpenRequest
//   header   - NUL-terminated header text; its length is taken with lstrlenW
// Returns the BOOL result of the underlying call.
BOOL AddRequestHeaders(HINTERNET hRequest, LPCWSTR header)
{
    SIZE_T len = lstrlenW(header);
#ifdef USE_WINHTTP
    return WinHttpAddRequestHeaders(hRequest, header, DWORD(len), WINHTTP_ADDREQ_FLAG_ADD);
#else
    return HttpAddRequestHeaders(hRequest, header, DWORD(len), HTTP_ADDREQ_FLAG_ADD);
#endif
}

// Sends the request together with an optional body.
//   hRequest - handle returned by OpenRequest
//   body     - request body bytes (const is cast away only because the
//              underlying APIs take a non-const pointer)
//   size     - body length in bytes
// Returns the BOOL result of the underlying call.
BOOL SendRequest(HINTERNET hRequest, const void* body, DWORD size)
{
#ifdef USE_WINHTTP
    return WinHttpSendRequest(hRequest, 0, 0, const_cast<void*>(body), size, size, 0);
#else
    return HttpSendRequest(hRequest, 0, 0, const_cast<void*>(body), size);
#endif
}

// Completes the request. With WinHTTP this calls WinHttpReceiveResponse;
// in the WinINet build nothing is needed after HttpSendRequest, so it
// simply returns TRUE.
BOOL EndRequest(HINTERNET hRequest)
{
#ifdef USE_WINHTTP
    return WinHttpReceiveResponse(hRequest, 0);
#else
    // if you use HttpSendRequestEx to send request then use HttpEndRequest in here!
    return TRUE;
#endif
}

// Queries a response header value.
//   hRequest - handle of a request whose response has been received
//   queryId  - WINHTTP_QUERY_* / HTTP_QUERY_* identifier
//   szBuf    - output buffer
//   pdwSize  - in: buffer size; out: size reported by the underlying API
// Returns the BOOL result of the underlying call.
// NOTE(review): WinHTTP appears to return header text as wide characters;
// confirm how callers that read szBuf as a narrow string should handle it.
BOOL QueryInfo(HINTERNET hRequest, int queryId, char* szBuf, DWORD* pdwSize)
{
#ifdef USE_WINHTTP
    return WinHttpQueryHeaders(hRequest, (DWORD) queryId, 0, szBuf, pdwSize, 0);
#else
    return HttpQueryInfo(hRequest, queryId, szBuf, pdwSize, 0);
#endif
}

// Reads response body bytes.
//   hRequest - handle of a request whose response has been received
//   buffer   - destination buffer
//   length   - maximum number of bytes to read
//   cbRead   - receives the number of bytes actually read
// Returns the BOOL result of WinHttpReadData / InternetReadFile.
BOOL ReadData(HINTERNET hRequest, void* buffer, DWORD length, DWORD* cbRead)
{
#ifdef USE_WINHTTP
    return WinHttpReadData(hRequest, buffer, length, cbRead);
#else
    return InternetReadFile(hRequest, buffer, length, cbRead);
#endif
}

// Closes a session, connection or request handle. A null handle is ignored,
// so callers may pass handles that were never opened.
void CloseInternetHandle(HINTERNET hInternet)
{
    if (hInternet)
    {
#ifdef USE_WINHTTP
        WinHttpCloseHandle(hInternet);
#else
        InternetCloseHandle(hInternet);
#endif
    }
}

/**********************************************************/

/// Sends an SMS by POSTing a form-encoded request over HTTP.
///
/// @param ececcid     Account id. NOTE(review): not used - the request body
///                    hard-codes "ececcid=600000"; confirm whether that is
///                    leftover test data.
/// @param password    Account password, placed in the request body.
/// @param msisdn      Recipient number, placed in the request body.
/// @param smsContent  Message text, placed in the request body.
/// @return The response body passed through UTF8_To_string, or "-1" when any
///         step fails or the response carries no data.
string SendSMS_HTTP(const long ececcid,const string & password ,const string & msisdn, const string &smsContent)
{
    // Closes the referenced handle on scope exit so that every return path,
    // including the early error returns, releases what was opened.
    struct HandleCloser {
        HINTERNET& h;
        explicit HandleCloser(HINTERNET& ref) : h(ref) {}
        ~HandleCloser() { CloseInternetHandle(h); }
    };

    string rtnStr ="-1";

    HINTERNET hSession = 0;
    HINTERNET hConnect = 0;
    HINTERNET hRequest = 0;
    // Destroyed in reverse order: request, then connection, then session.
    HandleCloser closeSession(hSession);
    HandleCloser closeConnect(hConnect);
    HandleCloser closeRequest(hRequest);

    wstring strHeader(L"Content-type: application/x-www-form-urlencoded\r\n");

    // Test data
    CrackedUrl crackedUrl(L"http://pi.f3.cn/SendSMS.aspx");

    // NOTE(review): the values are concatenated without URL-encoding although
    // the content type is x-www-form-urlencoded - confirm against the gateway.
    string StrPostData = "ececcid=600000&password="+password+"&msisdn="+msisdn+"&smscontent="+smsContent+"&msgtype=5&longcode=";
    StrPostData = string_To_UTF8(StrPostData);

    // Open session.
    hSession = OpenSession(L"HttpPost by lyz_sea@163.com");
    if (hSession == NULL) {
        cout<<"Error:Open session!\n";
        return "-1";
    }

    // Connect.
    hConnect = Connect(hSession, crackedUrl.GetHostName(), crackedUrl.GetPort());
    if (hConnect == NULL) {
        cout<<"Error:Connect failed!\n";
        return "-1";
    }

    // Open request.
    hRequest = OpenRequest(hConnect, L"POST", crackedUrl.GetPath(), crackedUrl.GetScheme());
    if (hRequest == NULL) {
        cout<<"Error:OpenRequest failed!\n";
        return "-1";
    }

    // Add request header.
    if (!AddRequestHeaders(hRequest, strHeader.c_str())) {
        cout<<"Error:AddRequestHeaders failed!\n";
        return "-1";
    }

    // Send post data.
    if (!SendRequest(hRequest, StrPostData.c_str(), static_cast<DWORD>(StrPostData.length()))) {
        cout<<"Error:SendRequest failed!\n";
        return "-1";
    }

    // End request
    if (!EndRequest(hRequest)) {
        cout<<"Error:EndRequest failed!\n";
        return "-1";
    }

    char szBuf[BUF_SIZE];
    DWORD dwSize = 0;
    szBuf[0] = 0;

    // Query header info.
#ifdef USE_WINHTTP
    int contextLengthId = WINHTTP_QUERY_CONTENT_LENGTH;
    int statusCodeId = WINHTTP_QUERY_STATUS_CODE;
    int statusTextId = WINHTTP_QUERY_STATUS_TEXT;
#else
    int contextLengthId = HTTP_QUERY_CONTENT_LENGTH;
    int statusCodeId = HTTP_QUERY_STATUS_CODE;
    int statusTextId = HTTP_QUERY_STATUS_TEXT;
#endif

    // Reserve the last byte so the terminator written below stays in bounds.
    dwSize = BUF_SIZE - 1;
    if (QueryInfo(hRequest, contextLengthId, szBuf, &dwSize)) {
        szBuf[dwSize] = 0;
        cout<<"Content length: "<<szBuf<<endl;
    }

    dwSize = BUF_SIZE - 1;
    if (QueryInfo(hRequest, statusCodeId, szBuf, &dwSize)) {
        szBuf[dwSize] = 0;
        cout<<"Status code: "<< szBuf<<endl;
    }

    dwSize = BUF_SIZE - 1;
    if (QueryInfo(hRequest, statusTextId, szBuf, &dwSize)) {
        szBuf[dwSize] = 0;
        cout<<"Status text:"<<szBuf<<endl;
    }

    // read data.
    // Collect the raw bytes of every chunk and convert once at the end, so a
    // multi-chunk response is returned whole and a multi-byte UTF-8 sequence
    // split across two reads is not converted in halves. Appending by length
    // also removes the need to write a terminator past a completely filled
    // buffer.
    string body;
    for (;;) {
        DWORD cbRead = 0;
        if (ReadData(hRequest, szBuf, BUF_SIZE, &cbRead) == FALSE) {
            break;
        }
        if (cbRead == 0) {
            break;
        }
        body.append(szBuf, cbRead);
    }

    if (!body.empty()) {
        rtnStr =::UTF8_To_string(body);
        cout<<rtnStr<<endl; // Output the response body.
    }

    return rtnStr;
}

以上方法中用到的CrackedUrl类定义在以下CrackURL.h文件中：

#pragma once

//#include<iostream>

//using namespace std

#define USE_WINHTTP // Comment this line out to use WinINet instead of WinHTTP.

#ifdef USE_WINHTTP

#include <winhttp.h>

#pragma comment(lib, "winhttp.lib")

#else

#include <wininet.h>

#pragma comment(lib, "wininet.lib")

#endif

// CrackedUrl

class CrackedUrl {

int m_scheme

wstring m_host

int m_port

wstring m_path

public:

CrackedUrl(LPCWSTR url)

{

URL_COMPONENTS uc = { 0}

uc.dwStructSize = sizeof(uc)

const DWORD BUF_LEN = 256

WCHAR host[BUF_LEN]

uc.lpszHostName = host

uc.dwHostNameLength = BUF_LEN

WCHAR path[BUF_LEN]

uc.lpszUrlPath = path

uc.dwUrlPathLength = BUF_LEN

WCHAR extra[BUF_LEN]

uc.lpszExtraInfo = extra

uc.dwExtraInfoLength = BUF_LEN

#ifdef USE_WINHTTP

if (!WinHttpCrackUrl(url, 0, ICU_ESCAPE, &uc)) {

cout<<"Error:WinHttpCrackUrl failed!\n"

}

#else

if (!InternetCrackUrl(url, 0, ICU_ESCAPE, &uc)) {

printf("Error:InternetCrackUrl failed!\n")

}

#endif

m_scheme = uc.nScheme

m_host = host

m_port = uc.nPort

m_path = path

}

int GetScheme() const

{

return m_scheme

}

LPCWSTR GetHostName() const

{

return m_host.c_str()

}

int GetPort() const

{

return m_port

}

LPCWSTR GetPath() const

{

return m_path.c_str()

}

static string UrlEncode(const char* p)

{

if (p == 0) {

return string()

}

string buf

for () {

int ch = (BYTE) (*(p++))

if (ch == '\0') {

break

}

if (isalnum(ch) || ch == '_' || ch == '-' || ch == '.') {

buf += (char)ch

}

else if (ch == ' ') {

buf += '+'

}

else {

char c[16]

wsprintfA(c, "%%%02X", ch)

buf += c

}

return buf

}

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/yw/12003172.html

如何优雅地使用c语言编写爬虫

发表评论

评论列表（0条）