Basics of Scraping Web Data



The code is as follows:

package com.tracker.offline.tools;

import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import com.tracker.common.utils.StringUtil;
import com.tracker.coprocessor.utils.JsonUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * Scrapes job-statistics data from a web page.
 */
public class SpiderUserPosTag {

    private static List<Integer> idList = Lists.newArrayList(113717565, 113856580);
    private static final String url = "http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_List";
    private static final String output = "E:\\result.tsv";

    public String getDataFromWeb(String id) throws IOException {
        // POST a JSON body to the stats endpoint, authenticated by a session cookie.
        Document response = Jsoup.connect(url)
                .timeout(12 * 1000)
                .userAgent("Mozilla/5.0")
                .method(Connection.Method.POST)
                .ignoreContentType(true)
                .cookie("JSESSIONID", "986C7BA4E6FE3DB5C4591F3481D3FF1D")
                .header("Content-Type", "application/json;charset=UTF-8")
                .data("a", "b")
                .requestBody("{\"startTime\":\"20190624\",\"endTime\":\"20190627\",\"seType\":\"2\","
                        + "\"idType\":\"1\",\"timeType\":\"1\",\"startIndex\":1,\"offset\":50,\"id\":" + id + "}")
                .post();
        return response.text();
    }

    public static void main(String[] args) throws Exception {
        SpiderUserPosTag sp = new SpiderUserPosTag();
        BufferedWriter bw = new BufferedWriter(new FileWriter(new File(output), true));
        try {
            for (Integer id : idList) {
                // Convert and parse the returned data: the "result" field holds
                // a JSON array whose elements are per-job stat objects.
                String line = sp.getDataFromWeb(String.valueOf(id));
                Map<String, String> maps = JsonUtil.parseJSON2MapStr(line);
                String str2 = maps.get("result");
                List<String> lists = JSONObject.parseArray(str2, String.class);
                for (String str3 : lists) {
                    Map<String, String> maps2 = JsonUtil.parseJSON2MapStr(str3);
                    // Write one tab-separated row per job record.
                    bw.write(StringUtil.joinString("\t",
                            maps2.get("jobId"), maps2.get("jobname"),
                            maps2.get("totalShowCount"), maps2.get("totalClickCount"), maps2.get("totalApplyCount"),
                            maps2.get("time"),
                            maps2.get("webShowCount"), maps2.get("webClickCount"), maps2.get("webApplyCount"),
                            maps2.get("appShowCount"), maps2.get("appClickCount"), maps2.get("appApplyCount"),
                            maps2.get("mShowCount"), maps2.get("mClickCount"), maps2.get("mApplyCount"),
                            maps2.get("showCount"), maps2.get("clickCount"), maps2.get("applyCount")) + "\n");
                }
            }
            bw.flush();
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
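Note that JsonUtil and StringUtil are in-house utilities from the com.tracker packages and their source is not shown in the post. A minimal sketch of plausible equivalents, built on fastjson and the standard library, is below; the class name TrackerUtilSketch and the exact behavior of the two methods are assumptions made to match how they are called above, not the actual library code.

package com.tracker.offline.tools;

import com.alibaba.fastjson.JSONObject;

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-ins for the in-house JsonUtil / StringUtil used above.
public class TrackerUtilSketch {

    // Parses a JSON object string into a flat Map<String, String>,
    // stringifying each value (nested objects become their JSON text).
    public static Map<String, String> parseJSON2MapStr(String json) {
        JSONObject obj = JSONObject.parseObject(json);
        Map<String, String> map = new HashMap<>();
        for (Map.Entry<String, Object> e : obj.entrySet()) {
            map.put(e.getKey(), e.getValue() == null ? null : e.getValue().toString());
        }
        return map;
    }

    // Joins the given values with the separator, matching the
    // StringUtil.joinString("\t", ...) call in the scraper.
    public static String joinString(String sep, String... values) {
        return String.join(sep, values);
    }
}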

Three elements need to be pinned down before writing the scraper:

url: the address the request is sent to, here http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_List.

Cookie and request body format: the session cookie (JSESSIONID, copied from a logged-in browser session) and the JSON layout of the POST body: the date range (startTime/endTime), the paging fields (startIndex, offset), and the id being queried.

Response parameters: the field names in the returned JSON, which determine what gets extracted and written out (jobId, jobname, totalShowCount, and so on). A sketch for inspecting them follows below.
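To determine that third element, it helps to print the raw response once and read the field names off it before writing any parsing code. A minimal sketch, assuming the same endpoint, cookie, and request body as the scraper above:

import org.jsoup.Connection;
import org.jsoup.Jsoup;

public class InspectResponse {
    public static void main(String[] args) throws Exception {
        // POST the same request as the scraper and dump the raw JSON so the
        // result fields (jobId, jobname, totalShowCount, ...) can be read off.
        String body = Jsoup.connect("http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_List")
                .timeout(12 * 1000)
                .userAgent("Mozilla/5.0")
                .method(Connection.Method.POST)
                .ignoreContentType(true)
                .cookie("JSESSIONID", "986C7BA4E6FE3DB5C4591F3481D3FF1D")
                .header("Content-Type", "application/json;charset=UTF-8")
                .requestBody("{\"startTime\":\"20190624\",\"endTime\":\"20190627\",\"seType\":\"2\","
                        + "\"idType\":\"1\",\"timeType\":\"1\",\"startIndex\":1,\"offset\":50,\"id\":113717565}")
                .execute()
                .body();
        System.out.println(body);
    }
}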

Summary

The above covers the basics of scraping web page data: use Jsoup to send a POST request with the right cookie and JSON body, parse the returned JSON, and write the fields of interest out as a TSV file.
