
数据来源:中国统计局标准 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/
由于时间关系,代码写得比较简单粗糙,可自行优化。复制下面的代码即可直接运行;
环境:jdk8;
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
import cn.hutool.http.HttpUtil;
import lombok.var;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
/**
* 抓取统计局区域编码
*/
public class TestArea {
public static void main(String[] args) throws InterruptedException {
test();
}
public static void test() throws InterruptedException {
var url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html";
String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";
String html = HttpUtil.get(url);
var htmlDoc = Jsoup.parse(html);
var selectClasses = htmlDoc.getElementsByClass("provincetr");
for (int i = 0; i < selectClasses.size(); i++) {
var provideCodes = selectClasses.get(i).children();
//1.省份 provincetr
for (int provideCodeIndex = 0; provideCodeIndex < provideCodes.size(); provideCodeIndex++) {
var provideCodeUrl = provideCodes.get(provideCodeIndex).select("a").attr("href");
var provideName = provideCodes.get(provideCodeIndex).select("a").text();
System.out.println("省份 = " + provideName);
if (!StringUtils.isBlank(provideCodeUrl)) {
var provideCode = provideCodeUrl.split("\.")[0];
String gotoCityHtml = HttpUtil.get(baseUrl + provideCodeUrl);
var cityHtmlDoc = Jsoup.parse(gotoCityHtml);
Elements selectCityClass = cityHtmlDoc.select(".citytr");
Thread.sleep(2000);
//2.城市 citytr
for (int cityIndex = 0; cityIndex < selectCityClass.size(); cityIndex++) {
var gotoCountyUrl = selectCityClass.get(cityIndex).select("a").attr("href");
var cityName = selectCityClass.get(cityIndex).select("a").text();
System.out.println("城市 = " + cityName);
if (StringUtils.isBlank(gotoCountyUrl)) {
continue;
}
String countytr = HttpUtil.get(baseUrl + gotoCountyUrl);
var countytrDoc = Jsoup.parse(countytr);
Elements countyClass = countytrDoc.select(".countytr");
Thread.sleep(2000);
//3.县区 countytr
for (int county = 0; county < countyClass.size(); county++) {
var gotoTownUrl = countyClass.get(county).select("a").attr("href");
var countyName = countyClass.get(county).select("a").text();
System.out.println("县区 = " + countyName);
if (StringUtils.isBlank(gotoTownUrl)) {
continue;
}
String towntr = HttpUtil.get(baseUrl + provideCode + "/" + gotoTownUrl);
var townDoc = Jsoup.parse(towntr);
Elements townClass = townDoc.select(".towntr");
var gotoTownCode = gotoTownUrl.split("/")[0];
Thread.sleep(2000);
//4.街道。镇 towntr
for (int town = 0; town < townClass.size(); town++) {
//towntr
var gotoVillageHref = townClass.get(town).select("a").attr("href");
var townName = townClass.get(town).select("a").text();
System.out.println("街道。镇 = " + townName);
if (StringUtils.isBlank(gotoVillageHref)) {
continue;
}
//居委会
String villageStr = HttpUtil.get(baseUrl + provideCode + "/" + gotoTownCode + "/" + gotoVillageHref);
var villageDoc = Jsoup.parse(villageStr);
Elements villagetr = villageDoc.select(".villagetr");
for (int villageIndex = 0; villageIndex < villagetr.size(); villageIndex++) {
var tds = villagetr.get(villageIndex).select("td");//[0].text();
var text = tds.get(0).text();
var text1 = tds.get(1).text();
var text2 = tds.get(2).text();
System.out.println(text + " " + text1 + " " + text2);
}
System.out.println("---------");
Thread.sleep(2000);
}
}
}
}
}
}
}
}
获取最新的省市区县字典数据代码
免责申明:爬取数据造成任何问题,概不负责,本文只做技术分享和学习。
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)