怎么用libxml2 默认解析器解析HTML文件_随笔

C程序里可以利用libxml2库去解析xml文档。利用libxml2可以很轻松地解析、生成xml文件。

这里演示一个小例子，包含了遍历节点，获取节点属性与值，以及获取CDATA里面的内容。

测试用的xml文件：

<?xml version="1.0" encoding="utf-8"?>

<root>

<content>

<pro id="moonApple"><![CDATA[<say>i still have lots to work on</say>]]></pro>

<details>

<detail name="dancing">like it</detail>

<detail name="singing">poor , just listen</detail>

<detail name="laugh"></detail>

<detail name="eating"><![CDATA[<food>candy</food>]]></detail>

</details>

</content>

</root>

test.c文件：

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libxml/parser.h>
#include <libxml/tree.h>

int parse_xml_file(char *buf,int len){

xmlDocPtr doc

xmlNodePtr root,node,detail

xmlChar *name,*value

doc=xmlParseMemory(buf,len)//parse xml in memory

if(doc==NULL){

printf("doc == null\n")

return -1

}

root=xmlDocGetRootElement(doc)

for(node=root->childrennodenode=node->next){

if(xmlStrcasecmp(node->name,BAD_CAST"content")==0)

break

}

if(node==NULL){

printf("no node = content\n")

return -1

}

for(node=node->childrennodenode=node->next){

if(xmlStrcasecmp(node->name,BAD_CAST"pro")==0){ //get pro node

name=xmlGetProp(node,BAD_CAST"id")

value=xmlNodeGetContent(node)

printf("this is %s:\n%s\n",(char*)name,(char*)value)//get value, CDATA is not parse and don't take into value

xmlFree(name)

xmlFree(value)

}else if(xmlStrcasecmp(node->name,BAD_CAST"details")==0){ //get details node

for(detail=node->childrendetaildetail=detail->next){ //traverse detail node

if(xmlStrcasecmp(detail->name,BAD_CAST"detail")==0){

name=xmlGetProp(detail,BAD_CAST"name")

value=xmlNodeGetContent(detail)

if(strlen((char*)value)!=0){

printf("%s : %s\n",(char*)name,(char*)value)

}else{

printf("%s has no value\n",(char*)name)

}

xmlFree(name)

xmlFree(value)

}

xmlFreeDoc(doc)

return 0

}

/*
 * Read the file "testxml" from the current directory into memory, echo
 * its contents, and hand the buffer to parse_xml_file().
 *
 * Returns EXIT_SUCCESS when the file was read and parsed, EXIT_FAILURE
 * on any I/O, allocation, or parse error.
 */
int main(void)
{
    char *content = NULL;
    long filesize;
    size_t nread;
    FILE *file;
    int status = EXIT_FAILURE;

    if ((file = fopen("testxml", "r")) == NULL) {
        perror("openf file error");
        return EXIT_FAILURE; /* nothing to read; do not touch a NULL stream */
    }

    /* Determine the file size by seeking to the end. */
    if (fseek(file, 0, SEEK_END) != 0 || (filesize = ftell(file)) < 0) {
        perror("get file size error");
        goto out_close;
    }
    rewind(file);

    /* parse_xml_file() takes the length as an int. */
    if (filesize > INT_MAX) {
        fprintf(stderr, "file too large\n");
        goto out_close;
    }

    /* One extra zeroed byte keeps the buffer NUL-terminated for printf. */
    content = calloc((size_t)filesize + 1, 1);
    if (content == NULL) {
        perror("alloc buffer error");
        goto out_close;
    }

    /*
     * In text mode fewer bytes than filesize may be delivered, so the
     * count actually read is what gets passed to the parser.
     */
    nread = fread(content, 1, (size_t)filesize, file);
    if (ferror(file)) {
        perror("read file error");
        goto out_close;
    }

    printf("content:\n%s\n", content);

    if (parse_xml_file(content, (int)nread) < 0) {
        /* errno is not set by the parser, so perror would mislead here. */
        fprintf(stderr, "parse xml failed\n");
        goto out_close;
    }

    status = EXIT_SUCCESS;

out_close:
    fclose(file);
    free(content);
    return status;
}

输出结果：

this is moonApple:

<say>i still have lots to work on</say>

dancing : like it

singing : poor , just listen

laugh has no value

eating : <food>candy</food>

这里主要关注XML文件里面的CDATA里面的内容

如果非要用C写的话，建议用neon库，可以极大减少工作量；如果是在socket层面编程的话，会非常累。

html文本解析就用libxml2库中的html parser即可。

如果没有语言要求，建议用python，非常简单。我前段时间花了两天，写了个爬取flickr信息的东西，然后写进数据库。

使用Objective-C解析HTML或者XML，系统自带有两种方式：一个是通过libxml，一个是通过NSXMLParser。不过这两种方式都需要自己写很多代码来处理抓取下来的内容，而且不是很直观。

有一个比较好的类库hpple，它是一个轻量级的包装框架，可以很好的解决这个问题。它是用XPath来定位和解析HTML或者XML。

安装步骤：

-加入 libxml2 到你的项目中

Menu Project->Edit Project Settings

搜索 “Header Search Paths”

添加新的 search path “${SDKROOT}/usr/include/libxml2”

Enable recursive option

-加入 libxml2 library 到你的项目

Menu Project->Edit Project Settings

搜索 “Other Linker Flags”

添加新的 search flag “-lxml2”

-将下面hpple的源代码加入到你的项目中:

TFHpple.h

TFHpple.m

TFHppleElement.h

TFHppleElement.m

XPathQuery.h

XPathQuery.m

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/6258757.html

怎么用libxml2 默认解析器解析HTML文件

发表评论

评论列表（0条）