
抓取一件商品的所有评论恐怕也无法做到,因为淘宝、亚马逊之类的网站都有验证措施。
#coding=utf-8
import json
import re
import sys
import urllib2
# Force the process-wide default string encoding to UTF-8.
# reload(sys) has to come first: Python 2's site module removes
# sys.setdefaultencoding from the sys namespace during startup, and
# reloading the module makes the function available again.
reload(sys)
sys.setdefaultencoding("utf-8")
# Only for Python 2
'''
只是简单的示例,没有检查无评论的情况,其它异常也可能没有检查,
你可以根据自己的需要再对代码修改
'''
#解析网页数据
def parseData(html_data, reg_str):
    """Search ``html_data`` for the first match of the regex ``reg_str``.

    Args:
        html_data: Text to search.
        reg_str: Regular expression pattern; its capture groups form the
            result.

    Returns:
        A tuple with the captured groups of the first match (an empty
        tuple when the pattern defines no groups), or ``None`` when the
        pattern does not match anywhere in ``html_data``.
    """
    result = re.search(reg_str, html_data)
    if result is None:
        return None
    return result.groups()
# commodity_url is the item detail page.
commodity_url = "http://item.taobao.com/item.htm?spm=a1z10.1-c.w4004-9140345655.2.y3LCj0&id=44454286657"
html_data = urllib2.urlopen(commodity_url).read()
# Extract the user ID and the item ID embedded in the detail page.
auction_msg = parseData(html_data, r'userNumId=(.*?)&auctionNumId=(.*?)&')
if not auction_msg:
    # Lookup failed: report it and exit with a non-zero status so callers
    # can tell the run did not succeed.
    print("Get reviews failed!")
    sys.exit(1)
reviews_url = "http://rate.taobao.com/feedRateList.htm?callback=jsonp_reviews_list&currentPageNum=1&rateType=&orderType=sort_weight&userNumId=%s&auctionNumId=%s" % auction_msg
response = urllib2.urlopen(reviews_url)
reviews_data = response.read().decode("gbk")
# The response is wrapped in a callback call, "name(<json>)"; keep only the
# text between the outermost parentheses.  parseData returns None when the
# pattern does not match, so test the match before indexing into it.
json_match = parseData(reviews_data, r'\((.*)\)')
json_str = json_match[0] if json_match else None
if not json_str:
    # No payload could be extracted: report it and exit with a failure status.
    print("Get reviews failed!")
    sys.exit(1)
jdata = json.loads(json_str)
# Only the first review is printed; iterate over jdata["comments"] to print
# all of them.  An item without reviews has nothing to index, so guard it.
comments = jdata.get("comments")
if comments:
    print(comments[0]["content"])
else:
    print("No reviews found.")
' Controls required on the form:
'   Text1       - TextBox holding the URL of the item page
'   Command1    - CommandButton that starts the import
'   ListView1   - ListView that receives the buyer records
'   WebBrowser1 - WebBrowser used to load the pages
' ------- code --------------------------

' Load the page whose URL is in Text1, extract the buyer-list URL from the
' J_showBuyerList element, open that URL and import its tables.
Private Sub Command1_Click()
    ' Best effort: DOM lookups fail when the page does not have the expected
    ' elements, so errors are skipped instead of being raised.
    On Error Resume Next
    Dim htmlstr As String
    Dim s1 As Long
    Dim s2 As Long
    Dim URL As String

    ' The form has a single button, so it is Command1 that is disabled
    ' while the import runs (the original referred to a Command2).
    Command1.Enabled = False
    WebBrowser1.Navigate Text1.Text
    WaitForBrowser

    htmlstr = WebBrowser1.Document.getElementById("J_showBuyerList").innerHTML
    ' The URL runs from the first "http:" up to the next comma.
    s1 = InStr(1, htmlstr, "http:")
    If s1 > 0 Then s2 = InStr(s1, htmlstr, ",")

    ' Only continue when a URL was actually found; otherwise there is
    ' nothing to navigate to.
    If s1 > 0 And s2 > s1 Then
        URL = Mid(htmlstr, s1, s2 - s1)
        ' innerHTML escapes "&" as "&amp;"; decode it.  Removing the bare
        ' text "amp" would leave "&;" behind and would also damage any
        ' parameter whose name or value contains "amp".
        URL = Replace(URL, "&amp;", "&")
        WebBrowser1.Navigate URL
        WaitForBrowser
        GetTable
    End If

    Command1.Enabled = True
End Sub

' Create the six report columns of ListView1 and switch it to report view.
Private Sub Form_Load()
    With ListView1
        .ColumnHeaders.Add , , "买家", 2200
        .ColumnHeaders.Add , , "宝贝名称", 3000
        .ColumnHeaders.Add , , "出价", 1000
        .ColumnHeaders.Add , , "购买数量", 1000
        .ColumnHeaders.Add , , "成交时间", 1800
        .ColumnHeaders.Add , , "状态", 1000
        .View = lvwReport
    End With
End Sub

' Answer a new-window request with this control's own Application object.
' NOTE(review): presumably this keeps pop-up links from opening in a
' separate browser window - confirm.
Private Sub WebBrowser1_NewWindow2(ppDisp As Object, Cancel As Boolean)
    Set ppDisp = WebBrowser1.Application
End Sub

' Keep processing window messages until WebBrowser1 is no longer busy.
Private Sub WaitForBrowser()
    Do
        DoEvents
        If WebBrowser1.Busy = False Then Exit Do
    Loop
End Sub

' Copy every data row of every TABLE in the current document into
' ListView1, then follow the "next page" link and repeat.
Private Sub GetTable()
    On Error Resume Next
    Dim rcount As Integer
    Dim X As Object
    Dim Row As Object
    Dim Item As ListItem
    Dim i As Long

    For Each X In WebBrowser1.Document.All
        If X.tagname = "TABLE" Then
            rcount = X.rows.length
            ' Row 0 is skipped (presumably the header row); the six cells
            ' of each remaining row fill the six ListView columns.
            For i = 1 To X.rows.length - 1
                Set Row = X.rows(i)
                Set Item = ListView1.ListItems.Add(, , Row.cells(0).innerText)
                Item.SubItems(1) = Row.cells(1).innerText
                Item.SubItems(2) = Row.cells(2).innerText
                Item.SubItems(3) = Row.cells(3).innerText
                Item.SubItems(4) = Row.cells(4).innerText
                Item.SubItems(5) = Row.cells(5).innerText
            Next
        End If
    Next

    ' A row count of 16 is treated as "more pages may follow".
    ' NOTE(review): presumably a full page is 1 header + 15 records - confirm.
    If rcount = 16 Then
        For Each X In WebBrowser1.Document.All
            If X.tagname = "A" Then
                If X.innerHTML = "<SPAN>下一页</SPAN>" Then
                    WebBrowser1.Navigate X.href
                    WaitForBrowser
                    GetTable
                    ' The recursive call handled all remaining pages and the
                    ' document this loop was enumerating has been replaced,
                    ' so stop instead of following another link from it.
                    Exit For
                End If
            End If
        Next
    End If
End Sub
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)