在工作中,经常会遇到与地区、地址有关的需求,例如在网页或者App端展示三级下拉框来选择省市区。本文旨在帮助大家从国家统计局获取最新的省市区数据,用于项目中。
以下代码支持爬虫省市区镇街道,设置有2个全局变量,默认只爬取省市区保存到本地,然后从本地读取爬虫html网页解析成json对象,也可以转成Excel,自己找插件或者写代码转就可以了,非常方便。
首先说说这个爬虫的几个注意点:
1.因为爬虫需要多次与远端服务器连接,并发连接时可能会遇到以下错误。该错误跟代码没有关系,跟网络有关系,是连接时发生多次重定向导致的。我没有花过多时间研究解决办法,目前等一会儿再次执行就可以了。有好的解决方法请在评论区评论一下,感激。
java.io.IOException: Too many redirects occurred trying to load URL
2.如果在读本地html网页文件的时候报错 FileNotFoundException,说明爬下来的网页文件不完整,缺失了一部分。这时可以将根目录下的region文件夹删除,下次启动程序时,会再次自动从远端爬取html网页文件并写入到本地。
3.如果在日志中遇到"连接失败,正在重试",就是遇到了上边说的多次重定向问题,这样写入到本地的网页文件可能不完整。即使再次启动程序后成功转出json,也需要开发者自行检查转出的json数据是否完整。
package com.lockie.region;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.lockie.region.entity.City;
import com.lockie.region.entity.County;
import com.lockie.region.entity.Province;
import com.lockie.region.entity.Street;
import com.lockie.region.entity.Town;
import com.lockie.region.enums.AreaLevelEnum;
import com.lockie.region.enums.OperationType;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpStatus;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.helpers.MessageFormatter;

/**
 * Crawls the administrative-division pages under {@link #base_URL}, caches the raw HTML below
 * {@code ./region/}, and then assembles the cached pages into a province/city/county JSON tree.
 *
 * <p>The run has two phases, both driven from {@link #main(String[])}:
 * <ol>
 *   <li>{@link #write()} downloads every page (one pool task per province) unless the cache
 *       directory already holds more than the flag file;</li>
 *   <li>the cached pages are parsed (again one pool task per province) and printed as JSON.</li>
 * </ol>
 *
 * <p>{@link #IS_GET_TOEN} and {@link #IS_GET_STREET} switch the two deeper levels on; both are
 * off by default, so only province, city and county pages are fetched and parsed.
 */
@Slf4j
public class RegionTask {

    public static final String base_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";
    public static final String AREA_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";

    private static final int TIMEOUT = 500000;
    /** Pause between two attempts at the same URL after a failed fetch. */
    private static final long RETRY_DELAY_MILLIS = 2000L;
    private static final String REGION = "./region/";
    /** Charset used both when caching pages and when reading them back. */
    private static final String PAGE_CHARSET = "gb2312";
    private static final File file;
    private static final File flagFile;

    public static final Gson gson = new GsonBuilder().create();
    public static final String HTML = ".html";
    public static final Boolean IS_GET_STREET = false;
    public static final Boolean IS_GET_TOEN = false;
    public static final String A = "a";
    public static final String TDA = "td a";
    public static final String HREF = "href";
    public static final String TEMPLATE = "table.{}table tbody tr.{}tr ";
    public static final String TD = "td";
    public static final String FLAG = "region.txt";
    /** Released once the download phase has finished (or could not be started). */
    public static final CountDownLatch write = new CountDownLatch(1);

    static {
        file = new File(REGION);
        flagFile = new File(REGION.concat(FLAG));
        try {
            if (!file.exists()) {
                file.mkdirs();
            }
            // The flag file is what must be tested here; testing the directory again
            // (as before) meant the flag file was never created.
            if (!flagFile.exists()) {
                flagFile.createNewFile();
            }
        } catch (IOException e) {
            log.warn("Could not prepare cache directory {}", file.getAbsolutePath(), e);
        }
    }

    public static ThreadFactory threadFactory =
            new ThreadFactoryBuilder().setNameFormat("thread-pool-%d --- ").build();
    public static ExecutorService threadPool = new ThreadPoolExecutor(50, 200, 0L, TimeUnit.SECONDS,
            new LinkedBlockingQueue<>(30), threadFactory, new AbortPolicy());

    /**
     * Builds a jsoup connection for {@code u} with the crawler's timeout and headers.
     *
     * <p>NOTE(review): the original text of this method was truncated after the {@code Accept}
     * header value; the value and the {@code return} are a minimal reconstruction. Any further
     * headers the original may have set are unknown.
     */
    private static Connection getConnection(String u) {
        Connection connection = Jsoup.connect(u).timeout(TIMEOUT);
        connection.header(HttpConnection.CONTENT_ENCODING, Charsets.UTF_8.name());
        connection.header("Accept", "*/*");
        return connection;
    }

    /**
     * Fetches {@code url} once.
     *
     * @return the parsed page, or {@code null} when {@code url} is {@code null}, the request
     *         fails, or the status is not 200
     */
    private static Document getDocument(String url) {
        if (url == null) {
            return null;
        }
        try {
            Response response = getConnection(url).execute();
            if (response.statusCode() != HttpStatus.SC_OK) {
                log.info("无法链接,正在重试~");
                return null;
            }
            // Parse the response already received; issuing connection.post() here would send
            // a second request (and as a POST).
            return response.parse();
        } catch (IOException e) {
            log.info("无法链接,正在重试~");
            log.debug("Fetch failed for {}", url, e);
            return null;
        }
    }

    /**
     * Fetches {@code url}, retrying with a fixed delay until a page is obtained.
     *
     * @throws InterruptedException if the thread is interrupted while waiting between attempts
     */
    private static Document fetchUntilAvailable(String url) throws InterruptedException {
        Document page = getDocument(url);
        while (page == null) {
            Thread.sleep(RETRY_DELAY_MILLIS);
            page = getDocument(url);
        }
        return page;
    }

    /**
     * Returns the part of {@code absoluteUrl} after {@link #base_URL}; this is the path the page
     * is cached under, relative to the cache directory.
     *
     * @throws IllegalArgumentException if {@code absoluteUrl} does not start with the base URL
     */
    private static String relativePath(String absoluteUrl) {
        // Plain prefix handling instead of String.split, which would treat the URL as a regex.
        if (absoluteUrl == null || !absoluteUrl.startsWith(base_URL)) {
            throw new IllegalArgumentException("URL is not below " + base_URL + ": " + absoluteUrl);
        }
        return absoluteUrl.substring(base_URL.length());
    }

    /** Waits at {@code barrier}, logging instead of propagating when the wait fails. */
    private static void awaitBarrier(CyclicBarrier barrier) {
        try {
            barrier.await();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for the other provinces", e);
        } catch (BrokenBarrierException e) {
            log.error("Barrier broken while waiting for the other provinces", e);
        }
    }

    /** Returns the elements of {@code provinces} whose {@code linkSelector} match has text. */
    private static List<Element> namedProvinces(Elements provinces, String linkSelector) {
        if (provinces == null) {
            return Collections.emptyList();
        }
        return provinces.stream()
                .filter(p -> StringUtils.isNotEmpty(p.select(linkSelector).text()))
                .collect(Collectors.toList());
    }

    /**
     * Downloads and caches every page below one province, then waits at {@code cyclicBarrier}.
     *
     * <p>Elements without a province name are ignored and do not wait at the barrier. A named
     * province always arrives at the barrier, even when its crawl fails, so that one failure
     * cannot block the remaining provinces forever.
     *
     * @param province      a province cell from the index page
     * @param cyclicBarrier barrier shared by all province tasks
     */
    public static void writeToLocal(Element province, CyclicBarrier cyclicBarrier) {
        if (province == null) {
            return;
        }
        Elements proSelect = province.select(TDA);
        String proName = proSelect.text();
        if (StringUtils.isEmpty(proName)) {
            return;
        }
        boolean crawled = false;
        try {
            String proHref = proSelect.attr(HREF);
            Document cityDocument = fetchUntilAvailable(base_URL.concat(proHref));
            Elements cities = getCities(cityDocument, proName, proHref, OperationType.WRITE);
            for (Element city : cities) {
                crawlCity(city, proName);
            }
            crawled = true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Write interrupted, province = {}", proName, e);
        } catch (IOException | RuntimeException e) {
            log.error("Write error, province = {}", proName, e);
        } finally {
            awaitBarrier(cyclicBarrier);
        }
        if (crawled) {
            log.info("{} 所有区数据爬取完毕。", proName);
        }
    }

    /** Caches the county page of one city row and, if enabled, descends into its counties. */
    private static void crawlCity(Element city, String proName) throws IOException, InterruptedException {
        Elements links = city.select(TDA);
        if (links.size() < 2) {
            return;
        }
        String cityName = links.get(1).text();
        String countyUrl = links.get(0).absUrl(HREF);
        Document countyDocument = fetchUntilAvailable(countyUrl);
        Elements counties = getCounties(countyDocument, proName, cityName,
                relativePath(countyUrl), OperationType.WRITE);
        if (!IS_GET_TOEN) {
            return;
        }
        for (Element county : counties) {
            crawlCounty(county, proName, cityName);
        }
    }

    /** Caches the town page of one county row and, if enabled, descends into its towns. */
    private static void crawlCounty(Element county, String proName, String cityName)
            throws IOException, InterruptedException {
        Elements links = county.select(TDA);
        if (links.size() < 2) {
            return;
        }
        String countyName = links.get(1).text();
        String townUrl = links.get(0).absUrl(HREF);
        Document townDocument = fetchUntilAvailable(townUrl);
        Elements towns = getTowns(townDocument, proName, cityName, countyName,
                relativePath(townUrl), OperationType.WRITE);
        if (!IS_GET_STREET) {
            return;
        }
        for (Element town : towns) {
            crawlTown(town, proName, cityName, countyName);
        }
    }

    /** Caches the street/village page of one town row. */
    private static void crawlTown(Element town, String proName, String cityName, String countyName)
            throws IOException, InterruptedException {
        Elements links = town.select(TDA);
        if (links.size() < 2) {
            return;
        }
        String townName = links.get(1).text();
        String streetUrl = links.get(0).absUrl(HREF);
        Document streetDocument = fetchUntilAvailable(streetUrl);
        getStreets(streetDocument, proName, cityName, countyName, townName,
                relativePath(streetUrl), OperationType.WRITE);
    }

    /** Selects the province cells of the index page; caches the page when writing. */
    public static Elements getProvinces(Document provincedocument, OperationType operationType)
            throws IOException {
        return getElements(provincedocument, AreaLevelEnum.PROVINCE, null, null, null, null,
                AreaLevelEnum.PROVINCE.getLevel().concat(HTML), operationType);
    }

    /** Selects the city rows of a province page; caches the page when writing. */
    public static Elements getCities(Document citydocument, String proName, String fileName,
            OperationType operationType) throws IOException {
        return getElements(citydocument, AreaLevelEnum.CITY, proName, null, null, null, fileName,
                operationType);
    }

    /** Selects the county rows of a city page; caches the page when writing. */
    public static Elements getCounties(Document countydocument, String proName, String cityName,
            String fileName, OperationType operationType) throws IOException {
        return getElements(countydocument, AreaLevelEnum.COUNTY, proName, cityName, null, null,
                fileName, operationType);
    }

    /** Selects the town rows of a county page; caches the page when writing. */
    public static Elements getTowns(Document towndocument, String proName, String cityName,
            String countyName, String fileName, OperationType operationType) throws IOException {
        return getElements(towndocument, AreaLevelEnum.TOWN, proName, cityName, countyName, null,
                fileName, operationType);
    }

    /** Selects the street/village cells of a town page; caches the page when writing. */
    public static Elements getStreets(Document streetdocument, String proName, String cityName,
            String countyName, String townName, String fileName, OperationType operationType)
            throws IOException {
        return getElements(streetdocument, AreaLevelEnum.VILLAGE, proName, cityName, countyName,
                townName, fileName, operationType);
    }

    /**
     * Selects the rows (or, for the province and village levels, the cells) of {@code document}
     * for {@code level}, and caches the page under {@code fileName} when
     * {@code operationType} is {@code WRITE}.
     *
     * @return the selected elements, or {@code null} when {@code document} is {@code null}
     * @throws IOException if the page cannot be written to the cache
     */
    private static Elements getElements(Document document, AreaLevelEnum level, String proName,
            String cityName, String countyName, String townName, String fileName,
            OperationType operationType) throws IOException {
        if (document == null) {
            return null;
        }
        try {
            String le = level.getLevel();
            String rowSelector = MessageFormatter.format(TEMPLATE, le, le).getMessage();
            boolean selectCells = AreaLevelEnum.PROVINCE == level || AreaLevelEnum.VILLAGE == level;
            Elements elements = document.select(selectCells ? rowSelector.concat(TD) : rowSelector);
            if (OperationType.WRITE == operationType) {
                savePage(document, fileName, proName, cityName, countyName, townName);
            }
            return elements;
        } catch (IOException | RuntimeException e) {
            log.error("GetAndWriteProvince method error, file = {}", fileName, e);
            throw e;
        }
    }

    /** Writes {@code document} to the cache under {@code fileName} and logs where it went. */
    private static void savePage(Document document, String fileName, String proName,
            String cityName, String countyName, String townName) throws IOException {
        File target = new File(REGION.concat(fileName));
        File parentFile = target.getParentFile();
        if (parentFile != null && !parentFile.exists()) {
            parentFile.mkdirs();
        }
        FileUtils.writeStringToFile(target, document.html(), PAGE_CHARSET);
        String path = target.getAbsolutePath();
        log.info("<---------------------------------->");
        if (StringUtils.isNotEmpty(townName)) {
            log.info("{}-{}-{}-{} 所有街道/村 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, countyName, townName, path);
        } else if (StringUtils.isNotEmpty(countyName)) {
            log.info("{}-{}-{} 所有镇 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, countyName, path);
        } else if (StringUtils.isNotEmpty(cityName)) {
            log.info("{}-{} 所有区/县 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, path);
        } else if (StringUtils.isNotEmpty(proName)) {
            log.info("{} 所有市 网页数据写入到本地完成~~,文件所在地址->{}", proName, path);
        } else {
            log.info("所有省 网页数据写入到本地完成~~,文件所在地址:{}", path);
        }
        log.info("<---------------------------------->");
    }

    /** Pool task that downloads one province. */
    private static class SyncWrite implements Runnable {
        private final Element element;
        private final CyclicBarrier cyclicBarrier;

        public SyncWrite(Element element, CyclicBarrier cyclicBarrier) {
            this.element = element;
            this.cyclicBarrier = cyclicBarrier;
        }

        @Override
        public void run() {
            writeToLocal(element, cyclicBarrier);
        }
    }

    /**
     * Starts the download phase.
     *
     * <p>If the cache directory already contains more than one entry the download is skipped.
     * Otherwise the index page is fetched and one task per named province is submitted to
     * {@link #threadPool}. The {@link #write} latch is released when all province tasks have
     * arrived at their barrier, or immediately when the download is skipped or cannot start.
     */
    public static void write() {
        String[] cached = file.list();
        if (cached != null && cached.length > 1) {
            log.info("所有省市区数据已经写入到本地完毕,若要更新所有省市区文件,请删除该文件夹即可,文件地址:{}", file.getAbsolutePath());
            write.countDown();
            return;
        }
        boolean started = false;
        try {
            Document document = fetchUntilAvailable(AREA_URL);
            System.out.println("等待所有 省份数据 爬取 完毕 !!!!!!!");
            Elements provinces = getProvinces(document, OperationType.WRITE);
            // Only named provinces wait at the barrier, so only they may be counted as parties.
            List<Element> named = namedProvinces(provinces, TDA);
            if (named.isEmpty()) {
                log.warn("No provinces found on {}", AREA_URL);
            } else {
                CyclicBarrier cyclicBarrier = new CyclicBarrier(named.size(), () -> {
                    log.info("===================== 所有省数据写入完毕 ========================");
                    write.countDown();
                });
                for (Element province : named) {
                    threadPool.submit(new SyncWrite(province, cyclicBarrier));
                }
                started = true;
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while starting the download", e);
        } catch (IOException | RuntimeException e) {
            log.error("Could not start the download", e);
        } finally {
            if (!started) {
                // Nobody will trip the barrier, so release the waiter here.
                write.countDown();
            }
        }
    }

    /**
     * Reads a cached page.
     *
     * @return the parsed page, or {@code null} when the file cannot be read
     */
    private static Document readStringToDocument(String fileAddr) {
        try {
            return Jsoup.parse(FileUtils.readFileToString(new File(fileAddr), PAGE_CHARSET));
        } catch (IOException | RuntimeException e) {
            log.error("Read string to document error, file = {}", fileAddr, e);
            return null;
        }
    }

    /**
     * Resolves {@code href} against the directory of {@code parentPagePath}. Pages are cached
     * under the same relative paths they have on the server, so a link found in a cached page
     * is relative to that page's own directory.
     */
    private static String childPath(String parentPagePath, String href) {
        int slash = parentPagePath.lastIndexOf('/');
        return parentPagePath.substring(0, slash + 1).concat(href);
    }

    /** Pool task that assembles one province from the cached pages. */
    private static class Area implements Runnable {
        private final Boolean isGetTown;
        private final Boolean isGetStreet;
        private final String url;
        private final Province province;
        private final CyclicBarrier cyclicBarrier;
        private final List<Province> provincesList;

        public Area(Boolean isGetTown, Boolean isGetStreet, String url, Province province,
                CyclicBarrier cyclicBarrier, List<Province> provincesList) {
            this.isGetTown = isGetTown;
            this.isGetStreet = isGetStreet;
            this.url = url;
            this.province = province;
            this.cyclicBarrier = cyclicBarrier;
            this.provincesList = provincesList;
        }

        @Override
        public void run() {
            try {
                String cityPage = url.concat(province.getProvinceCode().concat(HTML));
                province.setCities(readCities(cityPage));
                System.out.println(province.getProvinceName() + ":" + gson.toJson(province));
                provincesList.add(province);
            } catch (RuntimeException e) {
                log.error("Could not assemble province {}", province.getProvinceName(), e);
            } finally {
                // Always arrive, otherwise one failing province blocks the final output.
                awaitBarrier(cyclicBarrier);
            }
        }

        /** Returns the rows of {@code level} in the cached page, or {@code null} if there are none. */
        private Elements selectRows(String pagePath, AreaLevelEnum level) {
            Document page = readStringToDocument(pagePath);
            if (page == null) {
                return null;
            }
            String le = level.getLevel();
            Elements rows = page.select(MessageFormatter.format(TEMPLATE, le, le).getMessage());
            return rows.isEmpty() ? null : rows;
        }

        /** Parses the cities of a province page; {@code null} when the page has no city rows. */
        private List<City> readCities(String pagePath) {
            Elements rows = selectRows(pagePath, AreaLevelEnum.CITY);
            if (rows == null) {
                return null;
            }
            return rows.stream().map(row -> toCity(row, pagePath))
                    .filter(c -> null != c).collect(Collectors.toList());
        }

        private City toCity(Element row, String pagePath) {
            Elements links = row.select(TDA);
            if (links.size() < 2) {
                return null;
            }
            Element codeElement = links.get(0);
            City city = new City();
            city.setCityCode(codeElement.text().substring(0, 6));
            city.setCityName(links.get(1).text());
            city.setLevel(AreaLevelEnum.CITY.getLevel());
            city.setCounties(readCounties(childPath(pagePath, codeElement.attr(HREF))));
            return city;
        }

        /** Parses the counties of a city page; {@code null} when the page has no county rows. */
        private List<County> readCounties(String pagePath) {
            Elements rows = selectRows(pagePath, AreaLevelEnum.COUNTY);
            if (rows == null) {
                return null;
            }
            return rows.stream().map(row -> toCounty(row, pagePath))
                    .filter(c -> null != c).collect(Collectors.toList());
        }

        private County toCounty(Element row, String pagePath) {
            Elements links = row.select(TDA);
            if (links.size() < 2) {
                return null;
            }
            Element codeElement = links.get(0);
            County county = new County();
            county.setCountyCode(codeElement.text().substring(0, 6));
            county.setCountyName(links.get(1).text());
            county.setLevel(AreaLevelEnum.COUNTY.getLevel());
            if (isGetTown) {
                county.setTowns(readTowns(childPath(pagePath, codeElement.attr(HREF))));
            }
            return county;
        }

        /** Parses the towns of a county page; {@code null} when the page has no town rows. */
        private List<Town> readTowns(String pagePath) {
            Elements rows = selectRows(pagePath, AreaLevelEnum.TOWN);
            if (rows == null) {
                return null;
            }
            return rows.stream().map(row -> toTown(row, pagePath))
                    .filter(t -> null != t).collect(Collectors.toList());
        }

        private Town toTown(Element row, String pagePath) {
            // Select within this row; selecting on the whole row set gave every town the
            // first row's code and name.
            Elements links = row.select(TDA);
            if (links.size() < 2) {
                return null;
            }
            Element codeElement = links.get(0);
            Town town = new Town();
            town.setLevel(AreaLevelEnum.TOWN.getLevel());
            town.setTownCode(codeElement.text());
            town.setTownName(links.get(1).text());
            if (isGetStreet) {
                town.setStreets(readStreets(childPath(pagePath, codeElement.attr(HREF))));
            }
            return town;
        }

        /** Parses the streets of a town page; {@code null} when the page has no street rows. */
        private List<Street> readStreets(String pagePath) {
            Elements rows = selectRows(pagePath, AreaLevelEnum.VILLAGE);
            if (rows == null) {
                return null;
            }
            return rows.stream().map(this::toStreet)
                    .filter(s -> null != s).collect(Collectors.toList());
        }

        private Street toStreet(Element row) {
            Elements cells = row.select(TD);
            if (cells.size() < 3) {
                return null;
            }
            Street street = new Street();
            street.setLevel(AreaLevelEnum.VILLAGE.getLevel());
            street.setStreetCode(cells.get(0).text());
            street.setStreetTypeCode(cells.get(1).text());
            street.setStreetName(cells.get(2).text());
            return street;
        }
    }

    /**
     * Runs the download phase, waits for it, then assembles and prints the JSON for all
     * provinces. The thread pool is shut down once the result has been printed, or right away
     * when the assembly could not be started.
     */
    public static void main(String[] args) {
        boolean dispatched = false;
        try {
            write();
            write.await();
            String proLevel = AreaLevelEnum.PROVINCE.getLevel();
            // REGION already ends with a separator.
            Document proDocument = readStringToDocument(REGION.concat(proLevel).concat(HTML));
            Elements provinces = getProvinces(proDocument, OperationType.READ);
            List<Element> named = namedProvinces(provinces, A);
            if (named.isEmpty()) {
                log.error("No cached province data found under {}", file.getAbsolutePath());
                return;
            }
            // Filled from several pool threads, so it has to be synchronized.
            List<Province> provincesList = Collections.synchronizedList(Lists.<Province>newLinkedList());
            CyclicBarrier barrier = new CyclicBarrier(named.size(), () -> {
                log.info("所有省份json数据组装完毕!!!!");
                provincesList.sort(Comparator.comparing(Province::getProvinceCode));
                System.out.println("<---------- 执行结果开始 --------->");
                System.out.println(gson.toJson(provincesList));
                System.out.println("<---------- 执行结果结束 --------->");
                threadPool.shutdown();
            });
            for (Element element : named) {
                Elements a = element.select(A);
                Province province = new Province();
                province.setLevel(AreaLevelEnum.PROVINCE.getLevel());
                province.setProvinceCode(a.attr(HREF).trim().substring(0, 2));
                province.setProvinceName(a.text());
                threadPool.execute(new Area(IS_GET_TOEN, IS_GET_STREET, REGION, province, barrier,
                        provincesList));
            }
            dispatched = true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for the download", e);
        } catch (IOException | RuntimeException e) {
            log.error("Occur error", e);
        } finally {
            if (!dispatched) {
                // The barrier action will never run, so stop the pool (and any task already
                // waiting at the barrier) here to let the JVM exit.
                threadPool.shutdownNow();
            }
        }
    }
}
最终结果:
项目和转出的json文件等审核完毕,我会发布到CSDN,勿催。
省市区Json文件地址:点击下载2020年国家省市区Json文件
爬虫项目地址:点击下载项目
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)