大学排名附件
3,使用的技术
- Selenium 4.0.0 浏览器端自动化
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>${selenium.version}</version>
</dependency>
- hutool 5.7.14 工具包
- kumo 1.28 词云
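4,核心代码
4.1,创建Driver
<dependency>
    <groupId>com.kennycason</groupId>
    <artifactId>kumo-core</artifactId>
    <version>${kumo.version}</version>
</dependency>
<dependency>
    <groupId>com.kennycason</groupId>
    <artifactId>kumo-tokenizers</artifactId>
    <version>${kumo.version}</version>
</dependency>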
public static WebDriver createWebDriver(boolean debug,String ... debuggerAddress){ // 设置驱动 //System.setProperty("webdriver.chrome.driver", getProps().getStr("webdriver.qq.driver")); // 360 System.setProperty("webdriver.chrome.driver", getProps().getStr("webdriver.360.driver")); //调用谷歌 ChromeOptions options = new ChromeOptions(); //指定浏览器安装位置 options.setBinary(getProps().getStr("webdriver.360.bin")); // 管理员权限运行 --no-sandbox // 不打开界面 --headless options.addArguments("--no-sandbox"); // 接受非安全的 options.setAcceptInsecureCerts(true); if(debug && debuggerAddress.length > 0){ // 在cmd中执行 C:/Users/A/AppData/Local/Google/Chrome/Application/chrome.exe --remote-debugging-port=44444 // 并配置debuggerAddress,即可重用已经打开的谷歌浏览器 options.setExperimentalOption("debuggerAddress", debuggerAddress[0]); } return new ChromeDriver(options); }4.2,爬取大学数据并生成词云
package com.lcj.selenium.instance; import cn.hutool.core.io.FileUtil; import cn.hutool.core.lang.Console; import cn.hutool.core.text.csv.*; import cn.hutool.core.util.ReUtil; import cn.hutool.core.util.StrUtil; import com.kennycason.kumo.CollisionMode; import com.kennycason.kumo.WordCloud; import com.kennycason.kumo.WordFrequency; import com.kennycason.kumo.bg.CircleBackground; import com.kennycason.kumo.font.KumoFont; import com.kennycason.kumo.font.scale.SqrtFontScalar; import com.kennycason.kumo.nlp.FrequencyAnalyzer; import com.kennycason.kumo.nlp.tokenizers.ChineseWordTokenizer; import com.kennycason.kumo.palette.LinearGradientColorPalette; import com.lcj.selenium.utils.SeleniumUtil; import org.openqa.selenium.By; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import java.awt.*; import java.io.File; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; public class ChinaUniversityRankCreawler { // 大学排名地址 private static final String URL = "https://www.shanghairanking.cn/rankings/bcur/2021"; // 保存全部大学集合 private static final ListUNIVERSITY_INFO_LIST = new ArrayList<>(528); // 数据保存目录 private static final String USER_HOME = System.getProperty("user.home"); public static void main(String[] args) { WebDriver driver = SeleniumUtil.createWebDriver(false, "127.0.0.1:44444"); // 最大化网页窗口 driver.manage().window().maximize(); // 全局等待时间 driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS); // 打开页面 driver.get(URL); Console.log("当前网页地址: {} ,标题: {}", driver.getCurrentUrl(), driver.getTitle()); // 文件名字 String fileName = driver.findElement(By.cssSelector("#content > div.content-title > h1")).getText().trim(); fileName = StrUtil.format("{}{}{}.csv", USER_HOME,File.separator,fileName); Console.log(fileName); // 大学总数 int universityCount = ReUtil.getFirstNumber(driver.findElement(By.cssSelector("#content-box > div.tool-box > div > 
div:nth-child(3)")).getText().trim()); int size = 30; int pages = (int) Math.ceil(universityCount * 1.0 / size); // 文件在当前目录是否存在 if (!FileUtil.exist(fileName)) { // 解析写一页大学数据 for (int i = 1; i <= pages; i++) { if (i > 1) { WebElement nextPage = driver.findElement(By.cssSelector("#content-box > ul > li.ant-pagination-next > a")); nextPage.click(); } // 解析大学数据 parseUniversityInfo(driver); } // 数据保存到csv文件中 try (final CsvWriter csvWriter = CsvUtil.getWriter(new File(fileName), StandardCharsets.UTF_8);) { csvWriter.write(UNIVERSITY_INFO_LIST); csvWriter.flush(); } } // 生成词云 generateWordCloud(fileName); driver.quit(); System.exit(0); } private static void generateWordCloud(String fileName) { final FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer(); // 设置分词返回数量(频率最高的600个词) frequencyAnalyzer.setWordFrequenciesToReturn(600); // 最小分词长度 frequencyAnalyzer.setMinWordLength(2); // 中文分词 frequencyAnalyzer.setWordTokenizer(new ChineseWordTokenizer()); // 从文件中读取 CsvReader reader = CsvUtil.getReader(); UNIVERSITY_INFO_LIST.clear(); //从文件中读取CSV数据 CsvData data = reader.read(FileUtil.file(fileName), StandardCharsets.UTF_8); List rows = data.getRows(); //遍历行 for (CsvRow csvRow : rows) { //getRawList返回一个List列表,列表的每一项为CSV中的一个单元格(既逗号分隔部分) UNIVERSITY_INFO_LIST.add(new String[]{csvRow.get(0),csvRow.get(1),csvRow.get(2),csvRow.get(3),csvRow.get(4),csvRow.get(5),csvRow.get(6)}); } // 省份 List provinceList = UNIVERSITY_INFO_LIST.stream().map(s -> s[4]).collect(Collectors.toList()); // 类型 List categoryList = UNIVERSITY_INFO_LIST.stream().map(s -> s[5]).collect(Collectors.toList()); provinceList.addAll(categoryList); List provinceFrequencies = frequencyAnalyzer.load(provinceList); //设置图片分辨率 Dimension dimension = new Dimension(500, 500); //此处的设置采用内置常量即可,生成词云对象 WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT); //此处不设置会出现中文乱码 java.awt.Font font = new java.awt.Font("STSong-Light", 2, 18); wordCloud.setKumoFont(new KumoFont(font)); // 字体大小范围 
wordCloud.setFontScalar(new SqrtFontScalar(12, 42)); //设置边界及字体 wordCloud.setPadding(2); //因为我这边是生成一个圆形,这边设置圆的半径 wordCloud.setBackground(new CircleBackground(255)); //设置词云显示的三种颜色,越靠前设置表示词频越高的词语的颜色 wordCloud.setColorPalette(new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30)); // 图片背景色 wordCloud.setBackgroundColor(new Color(255, 255, 255)); wordCloud.build(provinceFrequencies); //生成词云图路径 wordCloud.writeToFile(StrUtil.format("{}{}{}.png",USER_HOME,File.separator,FileUtil.getName(fileName))); } private static void parseUniversityInfo(WebDriver driver) { List trs = driver.findElements(By.cssSelector("#content-box > div.rk-table-box > table > tbody > tr")); trs.forEach(tr -> { // 排名 String ranking = tr.findElement(By.cssSelector("td:nth-child(1) > div")).getText(); // logo String logo = tr.findElement(By.cssSelector("td.align-left > div > div.logo > img")).getAttribute("src"); // 名字 String zhName = tr.findElement(By.cssSelector("td.align-left > div > div.univname > div:nth-child(1) > div > div > a")).getText(); String enName = tr.findElement(By.cssSelector("td.align-left > div > div.univname > div:nth-child(2) > div > div > a")).getText(); // 省市 String province = tr.findElement(By.cssSelector("td:nth-child(3)")).getText().trim(); // 类型 String category = tr.findElement(By.cssSelector("td:nth-child(4)")).getText().trim(); // 总分 String score = tr.findElement(By.cssSelector("td:nth-child(5)")).getText().trim(); Console.log("排名:{} ,logo:{} ,中文名称:{} ,英文名称:{} ,省份:{} ,类型:{} ,总分:{}", ranking, logo, zhName, enName, province, category, score); UNIVERSITY_INFO_LIST.add(new String[]{ranking, logo, zhName, enName, province, category, score}); }); } }
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)