当我使用jsoup或htmlunit获取页面时,href字段丢失

当我使用jsoup或htmlunit获取页面时,href字段丢失,第1张

当我使用jsoup或htmlunit获取页面时,href字段丢失

对于每个搜索结果,页面中都有一个 <div class="rg_meta"> 元素,其中包含一个JSON对象,该对象还包含图片的url。使用类似于json-simple的JSON解析器来解析该对象,以下代码将打印图像网址:

// Fetches a Google image-search result page with jsoup and prints the image URL of
// every result. Each result carries a <div class="rg_meta"> whose text is a JSON
// object; the value stored under its "ou" key is printed as the image URL.
// Needs jsoup (Jsoup, Document, Element) and json-simple (JSONObject, JSONParser,
// ParseException) on the classpath.
String searchTerm = "naruto shippuden";
String searchUrl = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&biw=1920&bih=955&q="
        + searchTerm.replace(" ", "+") + "&gws_rd=cr";
try {
    // A browser-like user agent and referrer are sent along with the request.
    Document doc = Jsoup.connect(searchUrl)
            .userAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36")
            .referrer("https://www.google.com/")
            .get();

    // One parser is enough for all results.
    JSONParser parser = new JSONParser();
    for (Element result : doc.select("div.rg_meta")) {
        // div.rg_meta contains a JSON object, which also holds the image url
        JSONObject obj = (JSONObject) parser.parse(result.text());
        String imageUrl = (String) obj.get("ou");
        // just printing out the url to demonstrate the approach
        System.out.println("imageUrl: " + imageUrl);
    }
} catch (IOException e1) {
    // Network or HTTP failure while fetching the result page.
    e1.printStackTrace();
} catch (ParseException e) {
    // The text of a div.rg_meta element was not valid JSON.
    e.printStackTrace();
}

输出:

imageUrl: http://ib3.huluim.com/show_key_art/1603?size=1600x600&region=US
imageUrl: http://cdn.zonarutoppuden.com/ns/peliculas-naruto-shippuden.jpg
imageUrl: http://www.saiyanisland.com/news/wp-content/uploads2/2014/12/Naruto-Sasuke.jpg
...

更新资料

由于jsAction似乎不能很好地与htmlUnit配合使用,因此我建议使用phantomJs。只需为您的操作系统下载二进制文件并创建一个脚本文件。

创建一个 page.js 文件:

// PhantomJS script (page.js).
// Opens a Google image search, scrolls down one viewport height every 500 ms so
// that more results get loaded, clicks the "#smb" button when the result list
// stops growing, and finally writes the rendered page to output.html.
// PhantomJS executes ES5, so `var` and function expressions are used on purpose.
var page = require('webpage').create();
var fs = require('fs');

page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
page.zoomFactor = 0.1;
page.viewportSize = {
    width: 1920,
    height: 1080
};

var divCount = "-1";       // "data-ri" of the last result seen on the previous tick
var topPosition = 0;       // current vertical scroll offset in pixels
var unchangedCounter = 0;  // consecutive ticks on which the last result did not change

// Returns the "data-ri" attribute of the last result tile, or null when the page
// contains no result tiles (indexing an empty NodeList would throw instead).
function readLastResultIndex() {
    return page.evaluate(function () {
        var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
        if (divs.length === 0) {
            return null;
        }
        return divs[divs.length - 1].getAttribute("data-ri");
    });
}

// Clicks the "#smb" button inside the page if it exists.
// Returns true when the button was clicked, false otherwise.
function clickMoreButton() {
    return page.evaluate(function () {
        // querySelector yields null (not undefined) for a missing element,
        // so a plain truthiness check is the correct guard.
        var button = document.querySelector("#smb");
        if (button) {
            button.click();
            return true;
        }
        return false;
    });
}

// One polling step: scroll further down and decide whether loading has finished.
function tick() {
    var newDivCount = readLastResultIndex();

    topPosition = topPosition + 1080;
    page.scrollPosition = {
        top: topPosition,
        left: 0
    };

    if (newDivCount === divCount) {
        clickMoreButton();
        if (unchangedCounter === 5) {
            // Nothing new after several unchanged ticks: dump the page and stop.
            console.log(newDivCount);
            var path = 'output.html';
            fs.write(path, page.content, 'w');
            phantom.exit();
        } else {
            unchangedCounter = unchangedCounter + 1;
        }
    } else {
        unchangedCounter = 0;
    }
    divCount = newDivCount;
}

page.open('https://www.google.com/search?site=imghp&tbm=isch&source=hp&q=naruto+shippuden&gws_rd=cr', function (status) {
    console.log("Status: " + status);
    if (status === "success") {
        window.setInterval(tick, 500);
    } else {
        // Without this the script would never terminate after a failed load,
        // and a parent process waiting on it would block forever.
        phantom.exit();
    }
});

现在,我们使用phantomJs执行脚本文件,并使用jsoup解析结果:

try {    Process process = Runtime.getRuntime().exec("bin\phantomjs page.js"); //change path to phantomjs binary and your script file    process.waitFor();    document doc = Jsoup.parse(new File("output.html"),"UTF-8"); // output.html is created by phantom.js, same path as page.js    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {        System.out.println(element.attr("href"));    }    System.out.println("Number of results: " + doc.select("div.rg_di.rg_bx.rg_el.ivg-i a").size());} catch (IOException | InterruptedException e) {    e.printStackTrace();}

输出:

/imgres?imgurl=http%3A%2F%2Fib3.huluim.com%2Fshow_key_art%2F1603%3Fsize%3D1600x600%26region%3DUS&imgrefurl=http%3A%2F%2Fwww.hulu.com%2Fnaruto-shippuden&docid=OgW4j66rp7CKkM&tbnid=SElXvYDJj9cR6M%3A&w=1600&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwgzKAAwAA&iact=mrc&uact=8
/imgres?imgurl=http%3A%2F%2Fcdn.zonarutoppuden.com%2Fns%2Fpeliculas-naruto-shippuden.jpg&imgrefurl=http%3A%2F%2Fwww.zonarutoppuden.com%2F2010%2F10%2Fnaruto-shippuden-peliculas.html&docid=JR8NPqKrF3ac_M&tbnid=0EPPOYQcflXkMM%3A&w=900&h=600&bih=10800&biw=19200&ved=0ahUKEwjX2PXmptPOAhULVxoKHXfmDg8QMwg0KAEwAQ&iact=mrc&uact=8
...
Number of results: 463

更新:将url作为参数传递给脚本

脚本

page.js

// PhantomJS script (page.js), parameterised version.
// Usage: phantomjs page.js <url> <searchParameter>
// Opens <url>, scrolls down one viewport height every 500 ms so that more results
// get loaded, clicks the "#smb" button when the result list stops growing, and
// finally writes the rendered page to "<searchParameter>.html".
// PhantomJS executes ES5, so `var` and function expressions are used on purpose.
var page = require('webpage').create();
var fs = require('fs');
var system = require('system');

// system.args[0] is the script name; exactly two further arguments are expected.
var url = "";
var searchParameter = "";
if (system.args.length === 3) {
    url = system.args[1];
    searchParameter = system.args[2];
}

var divCount = "-1";       // "data-ri" of the last result seen on the previous tick
var topPosition = 0;       // current vertical scroll offset in pixels
var unchangedCounter = 0;  // consecutive ticks on which the last result did not change

// Returns the "data-ri" attribute of the last result tile, or null when the page
// contains no result tiles (indexing an empty NodeList would throw instead).
function readLastResultIndex() {
    return page.evaluate(function () {
        var divs = document.querySelectorAll(".rg_di.rg_bx.rg_el.ivg-i");
        if (divs.length === 0) {
            return null;
        }
        return divs[divs.length - 1].getAttribute("data-ri");
    });
}

// Clicks the "#smb" button inside the page if it exists.
// Returns true when the button was clicked, false otherwise.
function clickMoreButton() {
    return page.evaluate(function () {
        // querySelector yields null (not undefined) for a missing element,
        // so a plain truthiness check is the correct guard.
        var button = document.querySelector("#smb");
        if (button) {
            button.click();
            return true;
        }
        return false;
    });
}

// One polling step: scroll further down and decide whether loading has finished.
function tick() {
    var newDivCount = readLastResultIndex();

    topPosition = topPosition + 1080;
    page.scrollPosition = {
        top: topPosition,
        left: 0
    };

    if (newDivCount === divCount) {
        clickMoreButton();
        if (unchangedCounter === 5) {
            // Nothing new after several unchanged ticks: dump the page and stop.
            var path = searchParameter + '.html';
            fs.write(path, page.content, 'w');
            phantom.exit();
        } else {
            unchangedCounter = unchangedCounter + 1;
        }
    } else {
        unchangedCounter = 0;
    }
    divCount = newDivCount;
}

if (url === "" || searchParameter === "") {
    phantom.exit();
} else {
    // Kept in the else branch so that nothing is opened once exit was requested;
    // NOTE(review): phantom.exit() is not relied upon to halt the script synchronously.
    page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36';
    page.zoomFactor = 0.1;
    page.viewportSize = {
        width: 1920,
        height: 1080
    };

    page.open(url, function (status) {
        console.log("Status: " + status);
        if (status === "success") {
            window.setInterval(tick, 500);
        } else {
            phantom.exit();
        }
    });
}

Java代码

try {    //change path to phantomjs binary and your script file    String phantomJSPath = "phantomjs" + File.separator + "bin" + File.separator + "phantomjs";    String scriptFile = "page.js";    String searchTerm = "naruto+shippuden";    String urlParameter = "https://www.google.com/search?site=imghp&tbm=isch&source=hp&gws_rd=cr&q="+searchTerm;    Process process = Runtime.getRuntime().exec(phantomJSPath + " " + scriptFile + " " + urlParameter + " " + searchTerm);    process.waitFor();    document doc = Jsoup.parse(new File(searchTerm + ".html"),"UTF-8"); // output.html is created by phantom.js, same path as page.js    for (Element element : doc.select("div.rg_di.rg_bx.rg_el.ivg-i a")) {        System.out.println(element.attr("href"));    }    System.out.println("Number of results: " + doc.select("div.rg_di.rg_bx.rg_el.ivg-i a").size());} catch (IOException | InterruptedException e) {    e.printStackTrace();}


欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/zaji/5032326.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-11-15
下一篇 2022-11-15

发表评论

登录后才能评论

评论列表(0条)

保存