
"""Scraper for Beijing used-car listings on renrenche.com.

For every car the following fields are collected and stored in MySQL:

 1. name                         2. price
 3. detail-page link             4. picture link
 5. production date              6. mileage
 7. down payment                 8. price-cut information
 9. license plate location      10. emission standard
11. transfer record             12. owner's review
13. condition - exterior        14. condition - interior
15. condition - chassis         16. inspection agency result
17. annual inspection expiry    18. commercial insurance expiry
19. purchase invoice available  20. maintained at a 4S dealer
21. compulsory traffic insurance expiry

The listing-page scrapers take the listing URL; the detail-page scrapers take
the list of detail URLs. All of them append to (and return) the module-level
buffers below, which ``getcar_main`` empties after each page is stored.
"""
import functools
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Car names
car_name = []
# Car prices
car_price = []
# Detail-page links
car_url = []
# Picture links
car_picture_url = []
# Production dates
car_date = []
# Mileage
car_km = []
# Down-payment prices
car_pay = []
# Price-cut information
car_pi = []
# License plate location
car_location = []
# Emission standard
car_es = []
# Transfer record
car_tf = []
# Owner's review
car_usertx = []
# Raw condition texts, three consecutive entries per car
car_condit = []
# Condition - exterior
car_condit_out = []
# Condition - interior
car_condit_in = []
# Condition - chassis
car_condit_chassis = []
# Inspection agency result
car_result = []
# Annual inspection expiry date
car_procedures_YearlyInspection = []
# Commercial insurance expiry date
car_Ciet = []
# Whether a purchase invoice is available
car_invoice = []
# Whether the car was maintained at a 4S dealer
car_maintain = []
# Compulsory traffic insurance expiry date
car_compulsory = []

# First listing page to scrape.
page = 1

# Number of consecutive listing pages scraped by getcar_main().
_PAGE_COUNT = 50
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def db_mysql(car_name, car_date, car_km, car_price, car_pay, car_pi, car_url,
             car_picture_url, car_location, car_es, car_tf, car_usertx,
             car_condit_out, car_condit_in, car_condit_chassis, car_result,
             car_procedures_YearlyInspection, car_Ciet, car_invoice,
             car_maintain, car_compulsory):
    """Insert one row per car into ``rrc_table``.

    The arguments are parallel lists; the i-th entries of all of them form
    one row. ``zip`` stops at the shortest list, so surplus entries in longer
    lists are ignored. Each row is committed on its own; a row that fails to
    insert is rolled back and reported, and the remaining rows are still tried.
    """
    # Open the database connection.
    db = pymysql.connect(host='localhost', user='root', password='1234',
                         port=3306, database='rrc')
    sql = (
        'insert into rrc_table(car_name,car_date,car_km,car_price,car_pay,car_pi,'
        'car_url,car_picture_url,car_location,car_es,car_tf,car_usertx,'
        'car_condit_out, car_condit_in, car_condit_chassis,car_result,'
        'car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    rows = zip(car_name, car_date, car_km, car_price, car_pay, car_pi, car_url,
               car_picture_url, car_location, car_es, car_tf, car_usertx,
               car_condit_out, car_condit_in, car_condit_chassis, car_result,
               car_procedures_YearlyInspection, car_Ciet, car_invoice,
               car_maintain, car_compulsory)
    try:
        cursor = db.cursor()
        try:
            for i, row in enumerate(rows, start=1):
                try:
                    cursor.execute(sql, row)
                    db.commit()
                    print("爬取成功")
                except pymysql.MySQLError:
                    # Only database errors are treated as "bad row"; anything
                    # else (including Ctrl-C) propagates.
                    print("第" + str(i) + "条数据出现数据插入异常")
                    db.rollback()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when an unexpected error escapes.
        db.close()


def _reset_buffers():
    """Empty every module-level buffer so the next page starts clean."""
    for buf in (car_name, car_price, car_url, car_picture_url, car_date,
                car_km, car_pay, car_pi, car_location, car_es, car_tf,
                car_usertx, car_condit, car_condit_out, car_condit_in,
                car_condit_chassis, car_result,
                car_procedures_YearlyInspection, car_Ciet, car_invoice,
                car_maintain, car_compulsory):
        buf.clear()


def getcar_main():
    """Scrape ``_PAGE_COUNT`` listing pages starting at ``page`` and store them."""
    base_url = 'https://www.renrenche.com/bj/ershouche/p'
    for x in range(0, _PAGE_COUNT):
        print('开始第'+str(x+1)+'页内容爬取')
        # Advance the page number on every iteration (starting from `page`).
        url = base_url + str(page + x)

        names = get_carname(url)
        dates, kms = get_producedate(url)
        prices = get_price(url)
        pays = get_dp(url)
        price_cuts = get_pi(url)
        detail_urls = get_carurl(url)
        pictures = get_picture(url)

        print("正在抓取车牌所在地...")
        locations = get_carLicense(detail_urls)
        print("正在抓取二手车排放标准...")
        emissions = get_es(detail_urls)
        print("正在抓取二手车过户记录...")
        transfers = get_transfer(detail_urls)
        print("正在抓取车主评价...")
        reviews = get_omt(detail_urls)
        print("正在抓取车况信息...")
        condit_out, condit_in, condit_chassis = get_condit(detail_urls)
        print("正在抓取机构检测结果...")
        results = get_result(detail_urls)
        print("正在抓取车辆手续信息...")
        yearly, commercial, invoices, maintains, compulsories = get_procedures(detail_urls)

        print("正在存入数据库...")
        db_mysql(names, dates, kms, prices, pays, price_cuts, detail_urls,
                 pictures, locations, emissions, transfers, reviews,
                 condit_out, condit_in, condit_chassis, results,
                 yearly, commercial, invoices, maintains, compulsories)

        print('开始清空列表列表...')
        # The scrapers return the module-level buffers themselves, so clearing
        # the buffers (including the raw car_condit list) resets everything.
        _reset_buffers()
        print('所有列表已清空')


@functools.lru_cache(maxsize=128)
def _fetch_html(url):
    """Download *url* and return its HTML text.

    Successful downloads are cached so that the several scrapers that read
    the same page trigger a single HTTP request. Failures raise and are
    therefore never cached.

    Raises:
        requests.RequestException: on network errors, timeouts, or any
            response whose status code is not 200.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    rs = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    if rs.status_code != 200:
        raise requests.HTTPError(
            'unexpected status %s for %s' % (rs.status_code, url), response=rs)
    return rs.text


def get_page(url):
    """Return a fresh PyQuery document for *url*, or ``None`` on failure.

    A new document is built on every call because some scrapers mutate the
    document they receive (``get_price`` removes nodes).
    """
    try:
        return pq(_fetch_html(url))
    except Exception:
        # Best-effort boundary: download and HTML-parser errors are reported
        # and turned into None. Unlike a bare `except:`, this lets
        # KeyboardInterrupt / SystemExit through.
        print("url出错了!")
        return None


def get_in_page(url):
    """Return a PyQuery document for a detail page; same contract as get_page."""
    return get_page(url)


def _first_text(doc, selector, default=""):
    """Return the text of the first node matching *selector*, else *default*.

    Returning exactly one value per document keeps the per-car lists aligned
    with the list of detail URLs even when a page fails to load or the
    selector matches nothing.
    """
    if doc is None:
        return default
    for node in doc(selector).items():
        return node.text()
    return default


def _part(parts, index, default=""):
    """Return ``parts[index]`` or *default* when the index is out of range."""
    return parts[index] if index < len(parts) else default


def _collect_detail(urls, selector, target):
    """Append one text per detail URL (first match of *selector*) to *target*."""
    for url in urls:
        target.append(_first_text(get_in_page(url), selector))
    return target


# function1: detail-page links
def get_carurl(url):
    """Append the absolute detail-page link of every listing thumbnail."""
    doc = get_page(url)
    if doc is None:
        return car_url
    for item in doc(".thumbnail ").items():
        car_url.append('https://www.renrenche.com'+item.attr.href)
    return car_url


# function2: car names
def get_carname(url):
    """Append the ``data-title`` attribute of every schedule button."""
    doc = get_page(url)
    if doc is None:
        return car_name
    for name in doc(".schedule.btn-base.btn-wireframe").items():
        car_name.append(name.attr('data-title'))
    return car_name


# function3: production date and mileage
def get_producedate(url):
    """Split every ``.mileage`` text on '/' into production date and mileage."""
    doc = get_page(url)
    if doc is None:
        return car_date, car_km
    for date in doc(".mileage").items():
        parts = date.text().split('/')
        car_date.append(parts[0])
        # Keep both lists the same length even if the '/' separator is missing.
        car_km.append(_part(parts, 1))
    return car_date, car_km


# function4: prices
def get_price(url):
    """Append the price text of every listing, without the down-payment node."""
    doc = get_page(url)
    if doc is None:
        return car_price
    for price in doc(".tags-box").children('.price').remove('.down-payment').items():
        car_price.append(price.text())
    return car_price


# function5: picture links
def get_picture(url):
    """Append the picture URL of every thumbnail, preferring ``data-src``."""
    doc = get_page(url)
    if doc is None:
        return car_picture_url
    for p in doc(".thumbnail").children('.img-backgound').children('img').items():
        src = p.attr('data-src')
        if src is None:
            src = p.attr('src')
        # An <img> with neither attribute still gets an (empty) entry so the
        # list stays aligned with the other per-car lists.
        car_picture_url.append('https:'+src if src is not None else '')
    return car_picture_url


# function6: down payment
def get_dp(url):
    """Append the down-payment line of every ``.tags-box`` or "不可首付"."""
    doc = get_page(url)
    if doc is None:
        return car_pay
    for pice in doc(".tags-box").items():
        lines = pice.text().split("\n")
        # The third line is used as the down payment (presumably the text of
        # the .down-payment node - verify against the live markup).
        if len(lines) > 2:
            car_pay.append(lines[2])
        else:
            car_pay.append("不可首付")
    return car_pay


# function7: price-cut information
def get_pi(url):
    """Append "已降<amount>" for discounted cars, otherwise "近期未降价"."""
    doc = get_page(url)
    if doc is None:
        return car_pi
    for pi in doc(".thumbnail").items():
        lines = pi.text().split("\n")
        # A line that is exactly "已降" marks a discounted car; the second
        # line of the thumbnail text is taken as the amount.
        if "已降" in lines and len(lines) > 1:
            car_pi.append("已降"+lines[1])
        else:
            car_pi.append("近期未降价")
    return car_pi


# function8: license plate location
def get_carLicense(car_url):
    """Append the ``#car-licensed`` text of every detail page."""
    return _collect_detail(car_url, "#car-licensed", car_location)


# function9: emission standard
def get_es(car_url):
    """Append the emission-standard summary of every detail page."""
    return _collect_detail(
        car_url,
        ".span5.car-fluid-standard .detail-version3-right-icon .car-summary",
        car_es)


# function10: transfer record
def get_transfer(car_url):
    """Append the transfer-record summary of every detail page."""
    return _collect_detail(
        car_url,
        "#zhimaicar-detail-header-right .row-fluid-wrapper .car-transfer .car-summary",
        car_tf)


# function11: owner's review
def get_omt(car_url):
    """Append the owner's review text of every detail page."""
    return _collect_detail(
        car_url, ".text-about-car-owner .owner-main-text", car_usertx)


# function12: condition information
def get_condit(car_url):
    """Collect exterior / interior / chassis condition texts per detail page.

    Exactly three entries per page are stored in ``car_condit`` (padded with
    empty strings, extra matches dropped) so that the stride-3 slices below
    always line up with the detail URLs.

    Returns:
        Three new lists: exterior, interior and chassis texts.
    """
    selector = "#gallery .detail-car-appearance-title .zhimai-subtitle"
    for url in car_url:
        doc = get_in_page(url)
        texts = [] if doc is None else [cd.text() for cd in doc(selector).items()]
        car_condit.extend((texts + ["", "", ""])[:3])
    car_condit_out = car_condit[0::3]
    car_condit_in = car_condit[1::3]
    car_condit_chassis = car_condit[2::3]
    return car_condit_out, car_condit_in, car_condit_chassis


# function13: inspection agency result
def get_result(car_url):
    """Append the inspection-result description of every detail page."""
    return _collect_detail(
        car_url,
        ".report-inner-box .report-main .report-result-des",
        car_result)


# function14: paperwork information
def get_procedures(car_url):
    """Append the five paperwork fields of every detail page.

    The ``.interval-title-content`` text is split into lines and the values
    are read from the odd positions 1, 3, 5, 7 and 9 (presumably each value
    follows its label line - verify against the live markup). Missing lines
    yield empty strings, and each list receives exactly one entry per URL.
    """
    for url in car_url:
        text = _first_text(get_in_page(url), ".interval-title-content")
        lines = text.split("\n")
        car_procedures_YearlyInspection.append(_part(lines, 1))
        car_Ciet.append(_part(lines, 3))
        car_invoice.append(_part(lines, 5))
        car_maintain.append(_part(lines, 7))
        car_compulsory.append(_part(lines, 9))
    return car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory


# Run the scraper when executed as a script.
if __name__ == "__main__":
    getcar_main()
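# Sharing is welcome; when reposting, please credit the source: 内存溢出
# (Web-page residue) Scan with WeChat
# (Web-page residue) Scan with Alipay
# (Web-page residue) Comments (0)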