
https://www.bilibili.com/video/BV1LY411w76b/
#!/usr/bin/env python
# coding: utf-8
# In[1]:
from selenium import webdriver
from time import sleep
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import pandas as pd
from random import randint
import re
import os
"""
作者:数据小蜗(小蜗笔记)
https://blog.csdn.net/qq_42830971
仅作学习交流,用后删除,请勿修改频率代码,以免对计算机和网络造成损害,计算机 *** 作者承担全责。
"""
# In[2]:
# Load the task list: one row per (indicator search term, region) pair to scrape.
# Raw string avoids the invalid "\信" escape-sequence warning while keeping
# the exact same Windows-style path.
info = pd.read_excel(r'.\信息集合.xlsx')

# Resume support: "爬虫状态.xlsx" mirrors the task list plus a "状态" (status)
# column marking rows already finished.  Create it on the first run.
if os.path.exists('爬虫状态.xlsx'):
    info_A = pd.read_excel('爬虫状态.xlsx')
else:
    info_A = info.copy()
    info_A['状态'] = None
    info_A.to_excel('爬虫状态.xlsx')

info_index = list(range(len(info_A.index)))
second_keys = info_A['指标地区']    # region keyword per task
first_keys = info_A['指标搜索词']   # indicator search term per task

# Output directory for the per-task result workbooks (idempotent).
os.makedirs('./爬虫数据', exist_ok=True)
# In[3]:
# Drive Edge through CNKI's statistical-value search, one (term, region) task
# at a time, collecting every result row from year 2008 onward and saving each
# task's rows to its own Excel workbook.  Progress is checkpointed so an
# interrupted run can resume.
edge = webdriver.Edge()
edge.get("https://data.cnki.net/ValueSearch/Index?datatype=year&ky=GDP")
sleep(10)  # give the page (and any manual login) time to settle

for first_key, second_key, i_A in zip(first_keys, second_keys, info_index):
    print(first_key, second_key, i_A)
    if info_A.iloc[i_A, -1] == '已完成':
        continue  # already scraped on a previous run

    # Fill the two search boxes (indicator term, region) and submit.
    edge.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[1]/form/div/div[1]/input').clear()
    edge.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[1]/form/div/div[2]/input').clear()
    edge.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[1]/form/div/div[1]/input').send_keys("{}".format(first_key))
    edge.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[1]/form/div/div[2]/input').send_keys("{}".format(second_key))
    sleep(1)
    edge.find_element(By.XPATH, '//*[@id="AdvancedSearch"]').click()
    sleep(20)  # deliberately slow to avoid hammering the site

    # One accumulator list per result-table column.
    xuhao, shijian, diqu, zhibiao, shuzhi = [], [], [], [], []
    danwei, laiyuan, yema, xiazai_excel, xiazai_caj = [], [], [], [], []

    page_num = 1
    while True:
        e = etree.HTML(edge.page_source)
        time = 0
        point = 0  # set to 1 when a pre-2008 row ends this task

        xuhao_A = e.xpath('//*[@id="t1"]/tbody/tr/td[2]/text()')
        if not xuhao_A:
            break  # empty result table

        for i in range(len(xuhao_A)):
            shijian_test = e.xpath('//*[@id="t1"]/tbody/tr/td[3]')[i].text
            time = int(re.match('[0-9]+', shijian_test).group())  # leading year digits
            # Results are newest-first; stop once we reach pre-2008 data.
            if time < 2008:
                point = 1
                break
            xuhao.append(e.xpath('//*[@id="t1"]/tbody/tr/td[2]')[i].text)
            shijian.append(e.xpath('//*[@id="t1"]/tbody/tr/td[3]')[i].text)
            diqu.append(e.xpath('//*[@id="t1"]/tbody/tr/td[4]')[i].text)
            zhibiao.append(e.xpath('//*[@id="t1"]/tbody/tr/td[5]')[i].text)
            shuzhi.append(e.xpath('//*[@id="t1"]/tbody/tr/td[6]')[i].text)
            danwei.append(e.xpath('//*[@id="t1"]/tbody/tr/td[7]')[i].text)
            laiyuan.append(e.xpath('//*[@id="t1"]/tbody/tr/td[8]')[i].text)
            yema.append(e.xpath('//*[@id="t1"]/tbody/tr/td[9]')[i].text)
            # XPath positional predicates are 1-based: use i + 1.  (The
            # original used i, which matched nothing for row 0 and shifted
            # every subsequent row's download links by one row.)
            xiazai_excel_is_null = e.xpath('(//*[@id="t1"]/tbody/tr/td[10])[{0}]/a[1]/@href'.format(i + 1))
            xiazai_caj_is_null = e.xpath('(//*[@id="t1"]/tbody/tr/td[10])[{0}]/a[2]/@href'.format(i + 1))
            xiazai_excel.append(xiazai_excel_is_null[0] if xiazai_excel_is_null else None)
            xiazai_caj.append(xiazai_caj_is_null[0] if xiazai_caj_is_null else None)

        if point == 1:
            print(time)
            break

        # Advance to the next page, or stop when pagination is exhausted.
        page_num_A = e.xpath('//*[@id="NextPage"]/a/@value')
        if not page_num_A:
            break
        if int(page_num_A[0]) == page_num:
            break  # "next" points at the current page: last page reached
        page_num = int(page_num_A[0])
        edge.find_element(By.XPATH, '//*[@id="NextPage"]').click()
        sleep(6)
        for handle in edge.window_handles:  # focus the most recently opened window
            edge.switch_to.window(handle)

    print(len(xuhao), len(shijian), len(diqu), len(zhibiao), len(shuzhi),
          len(danwei), len(laiyuan), len(yema), len(xiazai_caj), len(xiazai_excel))
    datadict = {'序号': xuhao, '时间': shijian, '地区': diqu, '指标': zhibiao, '数值': shuzhi, '单位': danwei,
                '来源': laiyuan, '页码': yema, '下载excel': xiazai_excel, '下载caj': xiazai_caj}
    df_a = pd.DataFrame(datadict)
    df_a.to_excel('./爬虫数据/地区{0}-指标{1}知网爬虫数据.xlsx'.format(second_key, first_key))

    # Checkpoint progress so a restart skips this task.
    info_A.iloc[i_A, -1] = '已完成'
    print(info_A.iloc[i_A, -1])
    info_A.to_excel('爬虫状态.xlsx')

input()  # keep the browser open until the user presses Enter
print('爬虫100%')
# ---- scraped blog-page footer (not part of the script) ----
# 欢迎分享,转载请注明来源:内存溢出
# 微信扫一扫
# 支付宝扫一扫
# 评论列表(0条)