
import csv
import os
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
def main():
    """Drive the scrape: prompt for a Douyin user-search URL, open it in
    the shared Selenium driver, wait for a manual QR-code login, scroll so
    lazily loaded results render, then parse the rendered page.

    Relies on the module-level ``driver`` created in the ``__main__`` block.
    """
    # 1. Target URL, e.g.
    # 'https://www.douyin.com/search/%E6%A2%97?source=switch_tab&type=user'
    base_url = input('请输入douyin用户搜索页面url:')

    # 2. Fetch the page with Selenium.
    driver.get(base_url)
    driver.set_window_size(1000, 700)

    # Manual step: the user must scan the QR code to log in.
    print('请尽快扫码')
    time.sleep(15)   # allow time for the QR-code login

    drop_down()      # scroll down so more results are lazily loaded
    time.sleep(100)  # generous wait for the final batch to render

    # 3. Grab the fully rendered HTML and parse it.
    html_str = driver.page_source
    pase_page(html_str)
def pase_page(html_str):
    """Parse one search-results page and append the rows to ``7.csv``.

    Extracts username, fan count, approval count, description, and profile
    URL from each result card found in *html_str*.

    Bug fix: the CSV is opened in append mode, so the header is now written
    only when the file is new/empty instead of on every call (the original
    repeated the header row for each page parsed).
    """
    tree = etree.HTML(html_str)
    # NOTE(review): '//*[@]' is a garbled selector — the attribute name
    # (probably a class filter for result cards) was lost when this file
    # was copied; as written it matches nothing. TODO: restore it.
    li_list = tree.xpath('//*[@]')

    datas = []
    for li in li_list:
        spans = li.xpath('.//span/text()')
        links = li.xpath('.//a/@href')
        # Positional indexing presumes each card exposes >= 6 spans and a
        # link (TODO confirm against the live page layout); skip cards
        # that don't match rather than raising IndexError.
        if len(spans) < 6 or not links:
            continue
        datas.append({
            'username': spans[0],
            'fans': spans[3],
            'approve': spans[4],
            'detail': spans[5],
            'url': links[0],
        })

    header = ['username', 'fans', 'approve', 'detail', 'url']
    # Write the header only once per file (append mode would otherwise
    # duplicate it on every call).
    need_header = not os.path.exists('7.csv') or os.path.getsize('7.csv') == 0
    with open('7.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if need_header:
            writer.writeheader()
        writer.writerows(datas)
def drop_down():
    """Scroll the page downward in many small steps so that lazily
    loaded search results have time to render.

    Uses the module-level ``driver``; each step sets scrollTop to a
    growing multiple of the document height.
    """
    for step in range(1, 300, 4):
        time.sleep(1)  # pause so each freshly loaded batch can render
        fraction = step / 9
        scroll_js = (
            'document.documentElement.scrollTop = '
            'document.documentElement.scrollHeight * %f' % fraction
        )
        driver.execute_script(scroll_js)
if __name__ == '__main__':
    # Selenium step one: create the driver as a module-level global so
    # that main() and drop_down() can share it.
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 20)
    main()
# --- Non-code residue: footer text from the web page this script was
# --- copied from, kept as comments so the module remains importable.
# 欢迎分享,转载请注明来源:内存溢出
# 微信扫一扫
# 支付宝扫一扫
# 评论列表(0条)