
简单写了个,只是爬链接的,加上标题老报错,暂时没看出来原因,先给你粘上来吧(方法2无问题)
from
BeautifulSoup
import
BeautifulSoup
import
urllib2
import
re
def grabHref(url, localfile):
    """Fetch ``url`` and save every anchor href containing ``/tools/``.

    The page is downloaded with ``urllib2``, parsed with BeautifulSoup, and
    each matching href is written to ``localfile`` followed by ``'\\r\\n'``.

    Args:
        url: Address of the page to download.
        localfile: Path of the output file; it is opened with mode ``'w'``,
            so an existing file is overwritten.
    """
    html = urllib2.urlopen(url).read()
    # The bytes are decoded as GB2312 (undecodable bytes dropped) and then
    # re-encoded as UTF-8 before parsing.
    # NOTE(review): the regex variant of this scraper decodes the same site
    # as UTF-8 -- confirm which source encoding is actually correct.
    html = unicode(html, 'gb2312', 'ignore').encode('utf-8', 'ignore')
    anchors = BeautifulSoup(html).findAll('a')

    href_pattern = re.compile(r'href="([^"]*)"')
    tools_pattern = re.compile(r'/tools/')

    # The with-block guarantees the file is closed even if a write fails.
    with open(localfile, 'w') as myfile:
        for item in anchors:
            match = href_pattern.search(str(item))
            if match is None:
                # Anchors without a double-quoted href attribute would
                # otherwise crash on match.group(1); skip them.
                continue
            href = match.group(1)
            if tools_pattern.search(href):
                # Only the link is written. Writing the link title as well
                # (via BeautifulSoup(item).a.string) was tried and left
                # disabled because it raised errors.
                myfile.write(href)
                myfile.write('\r\n')
def main():
    """Run grabHref on the hard-coded page URL, writing to ``aHref.txt``."""
    grabHref("http://www.freebuf.com/tools", 'aHref.txt')


if __name__ == "__main__":
    main()
方法2:Re版
由于方法1有问题,只能获取到下载页面链接,所以换用Re解决,代码如下:
import
urllib2
import
re
# Method 2: pull (href, link text) pairs straight out of the raw HTML with a
# regular expression instead of an HTML parser.
url = 'http://www.freebuf.com/tools'
find_re = re.compile(r'href="([^"]*)".+?>(.+?)</a>')
pat2 = re.compile(r'/tools/')

html = urllib2.urlopen(url).read()
# Decode the page as UTF-8 and re-encode it as GB2312; characters that cannot
# be converted are dropped in both directions.
html = unicode(html, 'utf-8', 'ignore').encode('gb2312', 'ignore')

# The with-block closes the output file even if a write fails.
with open('aHref.txt', 'w') as myfile:
    for href, title in find_re.findall(html):
        # Filter on the href itself rather than on the repr of the whole
        # (href, title) tuple, so link text cannot trigger a match.
        if pat2.search(href):
            # Same output as the Python 2 statement
            # `print >>myfile, href, title`: space-separated, newline-ended.
            myfile.write('%s %s\n' % (href, title))

# Parenthesised so the line is valid under both Python 2 and Python 3.
print('Done!')
cin.ignore(a,ch)方法从输入流(cin)中提取字符并将其丢弃(ignore),不作使用。每丢弃一个字符,它都要计数并比较字符:如果计数值达到a,或者被丢弃的字符是ch,则cin.ignore()函数执行终止;否则,它继续等待。它的一个常用功能是清除以回车结束的输入缓冲区的内容,消除上一次输入对下一次输入的影响。比如可以这么用:cin.ignore(1024,'\n'),通常把第一个参数设置得足够大,这样实际上总是只有第二个参数'\n'起作用,所以这一句就是把回车之前(包括回车)的所有字符从输入缓冲(流)中清除出去。如果不给参数,则默认为cin.ignore(1,EOF),即把EOF前的1个字符清掉,没有遇到EOF就清掉一个字符然后结束,所以你每次都会少一个字符。
var HTMLFormat = (function() {
/**
 * Re-indent an HTML string so that each tag and each run of text sits on its
 * own line, indented according to tag nesting.
 *
 * @param {string} html_source - Markup to format (falsy is treated as '').
 * @param {number} indent_size - How many times indent_character is repeated
 *   per nesting level (falsy falls back to 2).
 * @param {string} indent_character - String repeated to build one indent
 *   step (falsy falls back to a single space).
 * @param {number} max_char - Line length at which text content and tag
 *   attributes are wrapped (falsy falls back to 70).
 * @returns {string} The formatted markup.
 */
function style_html(html_source, indent_size, indent_character, max_char) {
  var multi_parser;

  /**
   * Tokenizer plus output buffer. It alternates between reading text
   * ('CONTENT' mode) and reading a tag ('TAG' mode).
   */
  function Parser() {
    this.pos = 0; // index of the next unread character of this.input
    this.token = '';
    this.current_mode = 'CONTENT';
    // Bookkeeping for open tags: "<name>count" is how many <name> tags are
    // open, "<name><n>" is the indent level recorded when the n-th one was
    // opened, "<name><n>parent" is the key of its enclosing tag, and
    // "parent" is the key of the innermost open tag.
    this.tags = {
      parent: 'parent1',
      parentcount: 1,
      parent1: ''
    };
    this.tag_type = '';
    this.token_text = this.last_token = this.last_text = this.token_type = '';

    this.Utils = {
      whitespace: "\n\r\t ".split(''),
      // Tags that never get a closing tag and therefore never indent.
      single_token: 'br,input,link,meta,!doctype,basefont,base,area,hr,wbr,param,img,isindex,?xml,embed'.split(','),
      // Tags that get one extra line break in front of them.
      extra_liners: 'head,body,/html'.split(','),
      in_array: function(what, arr) {
        for (var i = 0; i < arr.length; i++) {
          if (what === arr[i]) {
            return true;
          }
        }
        return false;
      }
    };

    /**
     * Read text up to the next '<', collapsing each whitespace run into one
     * space, or into a line break plus indentation once max_char is reached.
     * Returns the text, or ['', 'TK_EOF'] when the input is exhausted and
     * nothing was read.
     */
    this.get_content = function() {
      var char = '';
      var content = [];
      var space = false;
      while (this.input.charAt(this.pos) !== '<') {
        if (this.pos >= this.input.length) {
          return content.length ? content.join('') : ['', 'TK_EOF'];
        }
        char = this.input.charAt(this.pos);
        this.pos++;
        this.line_char_count++;
        if (this.Utils.in_array(char, this.Utils.whitespace)) {
          // Leading whitespace is dropped; later whitespace is remembered
          // and emitted lazily in front of the next visible character.
          if (content.length) {
            space = true;
          }
          this.line_char_count--;
          continue;
        } else if (space) {
          if (this.line_char_count >= this.max_char) {
            content.push('\n');
            for (var i = 0; i < this.indent_level; i++) {
              content.push(this.indent_string);
            }
            this.line_char_count = 0;
          } else {
            content.push(' ');
            this.line_char_count++;
          }
          space = false;
        }
        content.push(char);
      }
      return content.length ? content.join('') : '';
    };

    /**
     * Read everything up to the next closing script tag (or the end of the
     * input) verbatim, without touching whitespace.
     */
    this.get_script = function() {
      var char = '';
      var content = [];
      var reg_match = new RegExp('\<\/script' + '\>', 'igm');
      reg_match.lastIndex = this.pos; // start searching at the current position
      var reg_array = reg_match.exec(this.input);
      var end_script = reg_array ? reg_array.index : this.input.length;
      while (this.pos < end_script) {
        if (this.pos >= this.input.length) {
          return content.length ? content.join('') : ['', 'TK_EOF'];
        }
        char = this.input.charAt(this.pos);
        this.pos++;
        content.push(char);
      }
      return content.length ? content.join('') : '';
    };

    /** Remember the indent level at which an opening tag was seen. */
    this.record_tag = function(tag) {
      if (this.tags[tag + 'count']) {
        this.tags[tag + 'count']++;
        this.tags[tag + this.tags[tag + 'count']] = this.indent_level;
      } else {
        this.tags[tag + 'count'] = 1;
        this.tags[tag + this.tags[tag + 'count']] = this.indent_level;
      }
      this.tags[tag + this.tags[tag + 'count'] + 'parent'] = this.tags.parent;
      this.tags.parent = tag + this.tags[tag + 'count'];
    };

    /**
     * Handle a closing tag: if the most recently opened tag of that name is
     * among the currently open ancestors, restore the indent level recorded
     * for it, then forget that tag.
     */
    this.retrieve_tag = function(tag) {
      if (this.tags[tag + 'count']) {
        var temp_parent = this.tags.parent;
        // Walk outwards through the open tags looking for the matching one.
        while (temp_parent) {
          if (tag + this.tags[tag + 'count'] === temp_parent) {
            break;
          }
          temp_parent = this.tags[temp_parent + 'parent'];
        }
        if (temp_parent) {
          this.indent_level = this.tags[tag + this.tags[tag + 'count']];
          this.tags.parent = this.tags[temp_parent + 'parent'];
        }
        delete this.tags[tag + this.tags[tag + 'count'] + 'parent'];
        delete this.tags[tag + this.tags[tag + 'count']];
        if (this.tags[tag + 'count'] == 1) {
          delete this.tags[tag + 'count'];
        } else {
          this.tags[tag + 'count']--;
        }
      }
    };

    /**
     * Read one tag (through its '>'), normalising whitespace between
     * attributes, and classify it by setting this.tag_type to SINGLE,
     * SCRIPT, STYLE, START or END. Returns the tag text, or
     * ['', 'TK_EOF'] when the input is exhausted and nothing was read.
     */
    this.get_tag = function() {
      var char = '';
      var content = [];
      var space = false;
      var comment;
      do {
        if (this.pos >= this.input.length) {
          return content.length ? content.join('') : ['', 'TK_EOF'];
        }
        char = this.input.charAt(this.pos);
        this.pos++;
        this.line_char_count++;
        if (this.Utils.in_array(char, this.Utils.whitespace)) {
          space = true;
          this.line_char_count--;
          continue;
        }
        if (char === "'" || char === '"') {
          // Quoted attribute values are copied as-is, except inside
          // "<!..." constructs, which are handled as a whole further down.
          if (!content[1] || content[1] !== '!') {
            char += this.get_unformatted(char);
            space = true;
          }
        }
        if (char === '=') {
          space = false; // no blank on either side of '='
        }
        if (content.length && content[content.length - 1] !== '=' && char !== '>' && space) {
          if (this.line_char_count >= this.max_char) {
            this.print_newline(false, content);
            this.line_char_count = 0;
          } else {
            content.push(' ');
            this.line_char_count++;
          }
          space = false;
        }
        content.push(char);
      } while (char !== '>');

      var tag_complete = content.join('');
      var tag_index;
      // The tag name ends at the first blank, or at '>' if there are no attributes.
      if (tag_complete.indexOf(' ') != -1) {
        tag_index = tag_complete.indexOf(' ');
      } else {
        tag_index = tag_complete.indexOf('>');
      }
      var tag_check = tag_complete.substring(1, tag_index).toLowerCase();
      if (tag_complete.charAt(tag_complete.length - 2) === '/' || this.Utils.in_array(tag_check, this.Utils.single_token)) {
        this.tag_type = 'SINGLE';
      } else if (tag_check === 'script') {
        this.record_tag(tag_check);
        this.tag_type = 'SCRIPT';
      } else if (tag_check === 'style') {
        this.record_tag(tag_check);
        this.tag_type = 'STYLE';
      } else if (tag_check.charAt(0) === '!') {
        if (tag_check.indexOf('[if') != -1) {
          // Conditional comment opener.
          if (tag_complete.indexOf('!IE') != -1) {
            comment = this.get_unformatted('-->', tag_complete);
            content.push(comment);
          }
          this.tag_type = 'START';
        } else if (tag_check.indexOf('[endif') != -1) {
          this.tag_type = 'END';
          this.unindent();
        } else if (tag_check.indexOf('[cdata[') != -1) {
          comment = this.get_unformatted(']]>', tag_complete);
          content.push(comment);
          this.tag_type = 'SINGLE';
        } else {
          // Any other "<!...": read through the closing "-->".
          comment = this.get_unformatted('-->', tag_complete);
          content.push(comment);
          this.tag_type = 'SINGLE';
        }
      } else {
        if (tag_check.charAt(0) === '/') {
          this.retrieve_tag(tag_check.substring(1));
          this.tag_type = 'END';
        } else {
          this.record_tag(tag_check);
          this.tag_type = 'START';
        }
        if (this.Utils.in_array(tag_check, this.Utils.extra_liners)) {
          this.print_newline(true, this.output);
        }
      }
      return content.join('');
    };

    /**
     * Copy input until `delimiter` has been read. Line breaks are replaced
     * by a newline plus the current indentation, and whitespace directly
     * after such a break is skipped. Returns '' immediately when `orig_tag`
     * already contains the delimiter.
     */
    this.get_unformatted = function(delimiter, orig_tag) {
      if (orig_tag && orig_tag.indexOf(delimiter) != -1) {
        return '';
      }
      var char = '';
      var content = '';
      var space = true;
      do {
        // Stop at end of input: without this guard an unterminated quote or
        // comment would spin forever, because charAt() past the end keeps
        // returning '' and the delimiter can never appear.
        if (this.pos >= this.input.length) {
          return content;
        }
        char = this.input.charAt(this.pos);
        this.pos++;
        if (this.Utils.in_array(char, this.Utils.whitespace)) {
          if (!space) {
            this.line_char_count--;
            continue;
          }
          if (char === '\n' || char === '\r') {
            content += '\n';
            for (var i = 0; i < this.indent_level; i++) {
              content += this.indent_string;
            }
            space = false;
            this.line_char_count = 0;
            continue;
          }
        }
        content += char;
        this.line_char_count++;
        space = true;
      } while (content.indexOf(delimiter) == -1);
      return content;
    };

    /**
     * Produce the next [text, type] pair. The body following a script
     * opening tag is read verbatim; otherwise the reader used depends on
     * current_mode.
     */
    this.get_token = function() {
      var token;
      if (this.last_token === 'TK_TAG_SCRIPT') {
        var temp_token = this.get_script();
        if (typeof temp_token !== 'string') {
          return temp_token;
        }
        return [temp_token, 'TK_CONTENT'];
      }
      if (this.current_mode === 'CONTENT') {
        token = this.get_content();
        if (typeof token !== 'string') {
          return token;
        } else {
          return [token, 'TK_CONTENT'];
        }
      }
      if (this.current_mode === 'TAG') {
        token = this.get_tag();
        if (typeof token !== 'string') {
          return token;
        } else {
          var tag_name_type = 'TK_TAG_' + this.tag_type;
          return [token, tag_name_type];
        }
      }
    };

    /**
     * Initialise the input, the output buffer and the formatting options,
     * and install the output helpers.
     */
    this.printer = function(js_source, indent_character, indent_size, max_char) {
      this.input = js_source || '';
      this.output = [];
      this.indent_character = indent_character || ' ';
      this.indent_string = '';
      this.indent_size = indent_size || 2;
      this.indent_level = 0;
      this.max_char = max_char || 70;
      this.line_char_count = 0;
      for (var i = 0; i < this.indent_size; i++) {
        this.indent_string += this.indent_character;
      }

      /**
       * Append a line break plus the current indentation to `arr`. Nothing
       * is appended to an empty array. Unless `ignore` is true, trailing
       * whitespace entries are removed first.
       */
      this.print_newline = function(ignore, arr) {
        this.line_char_count = 0;
        if (!arr || !arr.length) {
          return;
        }
        if (!ignore) {
          while (this.Utils.in_array(arr[arr.length - 1], this.Utils.whitespace)) {
            arr.pop();
          }
        }
        arr.push('\n');
        for (var i = 0; i < this.indent_level; i++) {
          arr.push(this.indent_string);
        }
      };

      this.print_token = function(text) {
        this.output.push(text);
      };

      this.indent = function() {
        this.indent_level++;
      };

      this.unindent = function() {
        if (this.indent_level > 0) {
          this.indent_level--;
        }
      };
    };

    return this;
  }

  multi_parser = new Parser();
  // max_char must be forwarded here; otherwise the caller's value is
  // silently replaced by the default of 70.
  multi_parser.printer(html_source, indent_character, indent_size, max_char);

  while (true) {
    var t = multi_parser.get_token();
    multi_parser.token_text = t[0];
    multi_parser.token_type = t[1];
    if (multi_parser.token_type === 'TK_EOF') {
      break;
    }
    switch (multi_parser.token_type) {
      case 'TK_TAG_START':
      case 'TK_TAG_SCRIPT':
      case 'TK_TAG_STYLE':
        // Opening tags go on a new line and deepen the indentation.
        multi_parser.print_newline(false, multi_parser.output);
        multi_parser.print_token(multi_parser.token_text);
        multi_parser.indent();
        multi_parser.current_mode = 'CONTENT';
        break;
      case 'TK_TAG_END':
        // The indent level was already restored while reading the tag.
        multi_parser.print_newline(true, multi_parser.output);
        multi_parser.print_token(multi_parser.token_text);
        multi_parser.current_mode = 'CONTENT';
        break;
      case 'TK_TAG_SINGLE':
        multi_parser.print_newline(false, multi_parser.output);
        multi_parser.print_token(multi_parser.token_text);
        multi_parser.current_mode = 'CONTENT';
        break;
      case 'TK_CONTENT':
        if (multi_parser.token_text !== '') {
          multi_parser.print_newline(false, multi_parser.output);
          multi_parser.print_token(multi_parser.token_text);
        }
        multi_parser.current_mode = 'TAG';
        break;
    }
    multi_parser.last_token = multi_parser.token_type;
    multi_parser.last_text = multi_parser.token_text;
  }
  return multi_parser.output.join('');
}
return function(data) {
var dataHolder = ['__dataHolder_', [Math.random(), Math.random(), Math.random(), Math.random()].join('_').replace(/[^0-9]/g, '_'), '_'].join('_')
var dataHolders = {}
var index = 0
data = data.replace(/(\")(data:[^\"]*)(\")/g, function($0, $1, $2, $3) {
var name = dataHolder + index++
dataHolders[name] = $2
return $1 + name + $3
})
data = style_html(data, 1, '\t', 0x10000000)
data = data.replace(new RegExp(dataHolder + '[0-9]+', 'g'), function($0) {
return dataHolders[$0]
})
return data
}
})()
// Demo: run the formatter on a small nested snippet and keep the result in
// formattedHtml.
var formattedHtml = HTMLFormat('<div><span><br>xxx</span></div>')
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)