The code is as follows:
package com.tracker.offline.tools;

import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import com.tracker.common.utils.StringUtil;
import com.tracker.coprocessor.utils.JsonUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * Scrapes job-show statistics from the stats page and writes them to a TSV file.
 */
public class SpiderUserPosTag {

    private static List<Integer> idList = Lists.newArrayList(113717565, 113856580);
    private static final String url = "http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_list";
    private static final String output = "E:\\result.tsv";  // the backslash must be escaped in a Java string literal

    public String getDataFromWeb(String id) throws IOException {
        Document response = Jsoup.connect(url)
                .timeout(12 * 1000)
                .userAgent("Mozilla/5.0")
                .method(Connection.Method.POST)
                .ignoreContentType(true)  // the endpoint returns JSON, not HTML
                .cookie("JSESSIONID", "986C7BA4E6FE3DB5C4591F3481D3FF1D")
                .header("Content-Type", "application/json;charset=UTF-8")
                .data("a", "b")  // placeholder form field kept from the original
                .requestBody("{\"startTime\":\"20190624\",\"endTime\":\"20190627\",\"seType\":\"2\","
                        + "\"idType\":\"1\",\"timeType\":\"1\",\"startIndex\":1,\"offset\":50,\"id\":" + id + "}")
                .post();
        return response.text();
    }

    public static void main(String[] args) throws Exception {
        SpiderUserPosTag sp = new SpiderUserPosTag();
        BufferedWriter bw = new BufferedWriter(new FileWriter(new File(output), true));
        try {
            for (Integer id : idList) {
                // Convert and parse the returned data: Map<String, List<Map<String, String>>>
                String line = sp.getDataFromWeb(String.valueOf(id));
                Map<String, String> maps = JsonUtil.parseJSON2MapStr(line);
                String result = maps.get("result");
                List<String> rows = JSONObject.parseArray(result, String.class);
                for (String row : rows) {
                    Map<String, String> fields = JsonUtil.parseJSON2MapStr(row);
                    bw.write(StringUtil.joinString("\t",
                            fields.get("jobId"), fields.get("jobName"),
                            fields.get("totalShowCount"), fields.get("totalClickCount"), fields.get("totalApplyCount"),
                            fields.get("time"),
                            fields.get("webShowCount"), fields.get("webClickCount"), fields.get("webApplyCount"),
                            fields.get("appShowCount"), fields.get("appClickCount"), fields.get("appApplyCount"),
                            fields.get("mShowCount"), fields.get("mClickCount"), fields.get("mApplyCount"),
                            fields.get("showCount"), fields.get("clickCount"), fields.get("applyCount")) + "\n");
                }
            }
            bw.flush();
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
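JsonUtil.parseJSON2MapStr and StringUtil.joinString are internal helpers from the tracker project and are not generally available. For readers without them, here is a minimal sketch of the same two-level parse using fastjson alone; the sample payload is illustrative, inferred from the parsing logic above rather than captured from the real endpoint:

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

public class ParseSketch {
    public static void main(String[] args) {
        // Illustrative payload: "result" is a JSON-encoded string holding an array
        // of row objects, matching the two-level parse in the code above.
        String line = "{\"result\":\"[{\\\"jobId\\\":\\\"1\\\",\\\"jobName\\\":\\\"demo\\\",\\\"showCount\\\":\\\"10\\\"}]\"}";
        JSONObject root = JSONObject.parseObject(line);
        JSONArray rows = JSONArray.parseArray(root.getString("result"));
        for (int i = 0; i < rows.size(); i++) {
            JSONObject row = rows.getJSONObject(i);
            // Join the columns with tabs, as the TSV writer above does.
            System.out.println(String.join("\t",
                    row.getString("jobId"),
                    row.getString("jobName"),
                    row.getString("showCount")));
        }
    }
}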
Three elements need to be determined before writing this kind of scraper (a minimal probe for checking them is sketched after this list):

url: the endpoint the page actually posts to, taken from the browser's network panel rather than the address bar; here it is http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_list.

cookie and request body format: this endpoint requires a valid JSESSIONID cookie plus a JSON body with startTime, endTime, seType, idType, timeType, startIndex, offset, and id; both can be copied from a logged-in browser session.

response parameters: the fields carried in the returned JSON; here a result array whose entries contain jobId, jobName, and the show/click/apply counters that the code writes to the TSV file.
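A minimal sketch for verifying all three at once, using only Jsoup (the endpoint, cookie value, and body below are copied from the code above; the JSESSIONID expires, so paste a fresh one from your browser's developer tools):

import org.jsoup.Connection;
import org.jsoup.Jsoup;

public class ProbeEndpoint {
    public static void main(String[] args) throws Exception {
        Connection.Response res = Jsoup
                .connect("http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_list")
                .timeout(12 * 1000)
                .userAgent("Mozilla/5.0")
                .method(Connection.Method.POST)
                .ignoreContentType(true)  // expect JSON back, not HTML
                .cookie("JSESSIONID", "986C7BA4E6FE3DB5C4591F3481D3FF1D")  // expires; replace with a live session cookie
                .header("Content-Type", "application/json;charset=UTF-8")
                .requestBody("{\"startTime\":\"20190624\",\"endTime\":\"20190627\",\"seType\":\"2\","
                        + "\"idType\":\"1\",\"timeType\":\"1\",\"startIndex\":1,\"offset\":50,\"id\":113717565}")
                .execute();
        // Dump the raw body: if the cookie or body format is wrong, an error page
        // or login redirect shows up here instead of the expected JSON.
        System.out.println(res.body());
    }
}

If this prints the expected JSON, all three elements are confirmed, and the parsing code can be written against the fields visible in the output.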