比这篇新的文章:
获取HTML span标签中间的内容
比这篇旧的文章: 学习Ruby的过程中的练习代码
作者: 半瓶墨水, 点击3265次, 评论(2), 收藏者(6), 打分:
所有评论,共2条:( 我也来说两句)
比这篇旧的文章: 学习Ruby的过程中的练习代码
校内网发帖机in Python,请勿滥用
语言: Python, 标签: 美丽的汤 校内 BeautifulSoup, 2008/05/25发布, 1年前更新, 作者: 半瓶墨水, 点击3265次, 评论(2), 收藏者(6), 打分:
Python语言: 校内网发帖机in Python,请勿滥用
01 #!/usr/bin/python
02 #encoding=utf-8
03 #使用前请查找并更改用户名和密码
04
05 import cookielib, urllib2, urllib, sys, time
06 from xml.sax.saxutils import unescape
07 from BeautifulSoup import BeautifulSoup # For processing HTML
08
def formalize(text):
    """Return *text* with blank lines dropped and each kept line double-spaced.

    The input is split on newlines, every line is stripped of leading and
    trailing whitespace, lines that end up empty are discarded, and each
    remaining line is emitted followed by two newline characters.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty when *text* has no non-blank lines.
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # join builds the result in one pass instead of repeated concatenation.
    return u''.join(line + u'\n\n' for line in stripped if line)
18
# Log in to xiaonei.com. The opener shares one cookie jar, so cookies set by
# the login response are sent along with the later posting requests.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
exheaders = [("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.1; Windows NT 5.1; SV1)")]
opener.addheaders = exheaders
url_login = 'http://xiaonei.com/Login.do'
email = 'xxxxx@gmail.com'  # TODO: change to your login name
password = '*********'  # TODO: change to your password
if password == '*********':
    # Only warn while the placeholder password is still in place.
    print("ERROR! you need to update the password to be successful!")
body = (('email', email), ('password', password))
req1 = opener.open(url_login, urllib.urlencode(body))  # cookies are now in cj
req1.close()

# Download stories from qiushibaike.com and publish them one by one.
# These form fields are sent with every entry; title and body are set per story.
body = {'relative_optype': 'publisher', 'blogControl': '1'}
url_post = 'http://blog.xiaonei.com/NewEntry.do'

# Posting loop.
count = 0
for i in range(11, 12):  # range(11, 12) yields only page 11
    url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
    page = urllib2.urlopen(url)
    try:
        data = page.read()
    finally:
        page.close()
    soup = BeautifulSoup(data)
    contents = soup.findAll('div', "content")
    for content in contents:
        count += 1
        print("processing page %d, %d items added" % (i, count))
        # Each story is serialized and re-parsed on its own, as in the
        # original flow, before being serialized again for the post body.
        minisoup = BeautifulSoup(str(content))
        text = str(minisoup)
        title = '糗事-%d' % count
        text += '<br/><a href="http://www.qiushibaike.com">来自糗事百科</a><br/>'
        body['title'] = title
        body['body'] = text
        # If nothing went wrong, the entry has been published at this point.
        req2 = opener.open(url_post, urllib.urlencode(body))
        req2.close()
02 #encoding=utf-8
03 #使用前请查找并更改用户名和密码
04
05 import cookielib, urllib2, urllib, sys, time
06 from xml.sax.saxutils import unescape
07 from BeautifulSoup import BeautifulSoup # For processing HTML
08
def formalize(text):
    """Return *text* with blank lines dropped and each kept line double-spaced.

    The input is split on newlines, every line is stripped of leading and
    trailing whitespace, lines that end up empty are discarded, and each
    remaining line is emitted followed by two newline characters.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty when *text* has no non-blank lines.
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # join builds the result in one pass instead of repeated concatenation.
    return u''.join(line + u'\n\n' for line in stripped if line)
18
# Log in to xiaonei.com. The opener shares one cookie jar, so cookies set by
# the login response are sent along with the later posting requests.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
exheaders = [("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.1; Windows NT 5.1; SV1)")]
opener.addheaders = exheaders
url_login = 'http://xiaonei.com/Login.do'
email = 'xxxxx@gmail.com'  # TODO: change to your login name
password = '*********'  # TODO: change to your password
if password == '*********':
    # Only warn while the placeholder password is still in place.
    print("ERROR! you need to update the password to be successful!")
body = (('email', email), ('password', password))
req1 = opener.open(url_login, urllib.urlencode(body))  # cookies are now in cj
req1.close()

# Download stories from qiushibaike.com and publish them one by one.
# These form fields are sent with every entry; title and body are set per story.
body = {'relative_optype': 'publisher', 'blogControl': '1'}
url_post = 'http://blog.xiaonei.com/NewEntry.do'

# Posting loop.
count = 0
for i in range(11, 12):  # range(11, 12) yields only page 11
    url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
    page = urllib2.urlopen(url)
    try:
        data = page.read()
    finally:
        page.close()
    soup = BeautifulSoup(data)
    contents = soup.findAll('div', "content")
    for content in contents:
        count += 1
        print("processing page %d, %d items added" % (i, count))
        # Each story is serialized and re-parsed on its own, as in the
        # original flow, before being serialized again for the post body.
        minisoup = BeautifulSoup(str(content))
        text = str(minisoup)
        title = '糗事-%d' % count
        text += '<br/><a href="http://www.qiushibaike.com">来自糗事百科</a><br/>'
        body['title'] = title
        body['body'] = text
        # If nothing went wrong, the entry has been published at this point.
        req2 = opener.open(url_post, urllib.urlencode(body))
        req2.close()
所有评论,共2条:( 我也来说两句)
| 1 |
jfxwc
1年前
回复
非常cool的代码。墨水实在太棒了。
|
| 2 |
1 糗事百科的地址变动了
|
代码
