douyin关键词用户爬虫

douyin关键词用户爬虫,第1张

import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import csv

def main():
    """Drive one scrape session: prompt for a Douyin user-search URL,
    let the operator pass the QR-code check, scroll to load results,
    then hand the rendered HTML to the parser.

    Uses the module-level selenium ``driver`` created in ``__main__``.
    """
    # Step 1: ask the operator for the search-results URL, e.g.
    # 'https://www.douyin.com/search/%E6%A2%97?source=switch_tab&type=user'
    search_url = input('请输入douyin用户搜索页面url:')

    # Step 2: navigate there with selenium and size the window.
    driver.get(search_url)
    driver.set_window_size(1000, 700)

    # Manual step: the operator must scan the QR code within 15 seconds.
    print('请尽快扫码')
    time.sleep(15)

    # Scroll repeatedly to trigger lazy loading, then wait for the page
    # to settle before snapshotting it.
    drop_down()
    time.sleep(100)

    # Step 3: parse whatever the browser has rendered so far.
    pase_page(driver.page_source)

def pase_page(html_str):
    """Parse one rendered search-results page and append rows to ``7.csv``.

    For each result card, extracts username, fan count, like count,
    description, and profile URL, then appends them as CSV rows.

    :param html_str: full page HTML as returned by ``driver.page_source``.
    """
    tree = etree.HTML(html_str)
    # FIXME: '//*[@]' is not valid XPath (the class selector was lost when
    # this code was copied) — lxml will raise XPathEvalError here. Restore
    # the original result-card selector, e.g. '//*[@class="..."]'.
    li_list = tree.xpath('//*[@]')

    datas = []
    for li in li_list:
        spans = li.xpath('.//span/text()')
        links = li.xpath('.//a/@href')
        # Skip malformed cards instead of crashing the whole page: the
        # original code indexed spans[0]/[3]/[4]/[5] and links[0]
        # unconditionally, raising IndexError on any card that renders
        # differently. (Index positions assume the page's span layout —
        # TODO confirm against the live DOM.)
        if len(spans) < 6 or not links:
            continue
        datas.append({
            'username': spans[0],
            'fans': spans[3],
            'approve': spans[4],
            'detail': spans[5],
            'url': links[0],
        })

    header = ['username', 'fans', 'approve', 'detail', 'url']
    with open('7.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        # Write the header only when the file is empty: the original wrote
        # it on every call, so append mode produced a duplicate header row
        # per scraped page.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(datas)

def drop_down():
    """Scroll progressively down the page so infinite-scroll results
    keep loading (uses the module-level selenium ``driver``)."""
    position = 1
    while position < 300:
        # Pause between scrolls so newly revealed content has time to load.
        time.sleep(1)
        driver.execute_script(
            'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
            % (position / 9)
        )
        position += 4





if __name__ == '__main__':
    # Step 1 of using selenium: create the driver as a module-level global
    # so main() and drop_down() can reach it without it being passed around.
    # (Launches a real Chrome browser window.)
    driver = webdriver.Chrome()
    # NOTE(review): `wait` is created but never used in the visible code —
    # candidate for removal, or for replacing the fixed time.sleep() calls
    # with explicit WebDriverWait conditions.
    wait = WebDriverWait(driver,20)
    main()

欢迎分享,转载请注明来源:内存溢出

原文地址:https://54852.com/langs/915578.html

(0)
打赏 微信扫一扫微信扫一扫 支付宝扫一扫支付宝扫一扫
上一篇 2022-05-16
下一篇 2022-05-16

发表评论

登录后才能评论

评论列表(0条)

    保存