
您可以使用递归函数逐层向下遍历帖子及其回复。RSelenium 仅用于获取页面源码:
# Scrape a Disqus comment thread into a nested list of posts and replies.
# RSelenium is used only to render the embed page and fetch its source;
# xml2 parses the HTML and jsonlite is used afterwards to display results.
library(xml2)
library(RSelenium)
library(jsonlite)

# NOTE(review): startServer() is defunct in recent RSelenium releases; on
# those versions start the Selenium server another way (e.g. rsDriver()).
selServ <- startServer()

appURL <- "http://disqus.com/embed/comments/?base=default&version=90aeb3a56d1f2d3db731af14996f11cf&f=malta-today&t_i=article_67726&t_u=http%3A%2F%2Fwww.maltatoday.com.mt%2Fnews%2Fnational%2F67726%2Fair_malta_pilots_demands_30_basic_salary_increase&t_d=Air%20Malta%20pilots%E2%80%99%20demands%3A%2030%25%20basic%20salary%20increase%2C%20increased%20duty%20payments%2C%20double%20%E2%80%98denied%20leave%E2%80%99%20payment&t_t=Air%20Malta%20pilots%E2%80%99%20demands%3A%2030%25%20basic%20salary%20increase%2C%20increased%20duty%20payments%2C%20double%20%E2%80%98denied%20leave%E2%80%99%20payment&s_o=default"

# Load the page in a browser, grab the rendered source, then shut
# the browser session and the Selenium server down again.
remDr <- remoteDriver()
remDr$open()
remDr$navigate(appURL)
pgSource <- remDr$getPageSource()[[1]]
remDr$close()
selServ$stop()

doc <- read_html(pgSource)

# Top-level posts only; replies are reached recursively by content_fun().
# NOTE(review): the attribute tests in the three XPath predicates marked
# "restored" were blank ("[@]", which is not valid XPath) in the published
# copy of this script. The class names used here follow the Disqus embed
# markup -- confirm them against the live page source.
appNodes <- xml_find_all(doc, "//ul[@id='post-list']/li[@class='post']") # restored

# Recursively extract the content of one post and of all its replies.
#
# x: an xml2 node for a single post (<li> element).
# Returns a list with character vectors `poster`, `posted`, `date` and
# `message`; when the post has replies, the list also has a `children`
# element holding one such list per reply.
content_fun <- function(x) {
  body_nodes <- xml_find_all(
    x, "./div[@data-role]/.//div[@class='post-body']" # restored
  )
  post <- list(
    poster = xml_text(
      xml_find_all(body_nodes, ".//span[@class = 'post-byline']")
    ),
    posted = xml_text(
      xml_find_all(body_nodes, ".//span[@class = 'post-meta']")
    ),
    date = xml_attr(
      xml_find_all(body_nodes, ".//a[@class = 'time-ago']"), "title"
    ),
    message = xml_text(
      xml_find_all(body_nodes, ".//div[@data-role = 'message']")
    )
  )

  # Only direct replies are selected here; deeper levels are handled by
  # the recursive call on each reply.
  children <- xml_find_all(
    x, "./ul[@class='children']/li[@class='post']" # restored
  )
  if (length(children) > 0) {
    post$children <- lapply(children, content_fun)
  }
  post
}

postData <- lapply(appNodes, content_fun)

# For example, this is the third post:
> prettify(toJSON(postData[[3]]))
{
    "poster": [
        "\nMary Attard\n\n"
    ],
    "posted": [
        "\n•\n\n\na month ago\n\n"
    ],
    "date": [
        "Thursday, July 21, 2016 6:12 AM"
    ],
    "message": [
        "\nI give up. Air Malta should be closed down.\n"
    ],
    "children": [
        {
            "poster": [
                "\nJoseph Lawrence\n\n Mary Attard\n"
            ],
            "posted": [
                "\n•\n\n\na month ago\n\n"
            ],
            "date": [
                "Thursday, July 21, 2016 7:43 AM"
            ],
            "message": [
                "\nAir Malta should have been privatized or sold out right a long time ago. It is costing the TAX PAYER millions, it has for a long, long time.\n"
            ]
        },
        {
            "poster": [
                "\nJ.Borg\n\n Mary Attard\n"
            ],
            "posted": [
                "\n•\n\n\na month ago\n\n"
            ],
            "date": [
                "Thursday, July 21, 2016 5:23 PM"
            ],
            "message": [
                "\nYes - at this stage we taxpayers will be better off without Air Malta. We closed Malta Dry Docks and we survived. We can close Air Malta and we'll survive even better. After all, we have many more airlines serving us.\n"
            ]
        }
    ]
}
您可以清理并抓取所需的内容。
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)