比这篇新的文章:
Codee#11797
比这篇旧的文章: C中的位段的使用
作者: sofoot, 点击104次, 评论(0), 收藏者(0), , 打分:
所有评论,共0条:( 我也来说两句)
比这篇旧的文章: C中的位段的使用
使用lxml解析html文件
语言: Python, 标签: lxml, 2010/06/26发布, 2个月前更新, 作者: sofoot, 点击104次, 评论(0), 收藏者(0), 打分:
Python语言: 使用lxml解析html文件
01 #coding: utf-8
02
03 from lxml import *
04 import lxml.html as H
05 import urllib2
06
def getart(url):
    """Fetch *url* and return the text of every matched paragraph.

    The downloaded page is parsed with ``lxml.html`` and paragraphs are
    selected with a fixed absolute XPath, so the result is an empty list
    when the page layout does not match that path.

    Args:
        url: Address of the page to download.

    Returns:
        A list with one UTF-8 encoded string per matched ``<p>`` element, in
        document order, with every carriage return replaced by a newline.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()

    doc = H.document_fromstring(content)
    paragraphs = doc.xpath('/html/body/div[2]/div[2]/div/div/ul/li/div[2]/p')
    return [
        p.text_content().encode('utf-8').replace('\r', '\n')
        for p in paragraphs
    ]
20
if __name__ == '__main__':
    # Pages to scrape; the second URL carries ``?start=100`` (presumably the
    # next page of the same topic -- confirm against the site).
    urls = ['http://www.douban.com/group/topic/12018319/',
            'http://www.douban.com/group/topic/12018319/?start=100']
    # ``with`` closes the output file even if a download or parse fails
    # part-way through, so already written text is flushed to disk.
    with open('zheda.txt', 'w') as outfile:
        for url in urls:
            for item in getart(url):
                outfile.write(item + '\n')
02
03 from lxml import *
04 import lxml.html as H
05 import urllib2
06
def getart(url):
    """Fetch *url* and return the text of every matched paragraph.

    The downloaded page is parsed with ``lxml.html`` and paragraphs are
    selected with a fixed absolute XPath, so the result is an empty list
    when the page layout does not match that path.

    Args:
        url: Address of the page to download.

    Returns:
        A list with one UTF-8 encoded string per matched ``<p>`` element, in
        document order, with every carriage return replaced by a newline.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
    finally:
        # Release the connection even when the read fails.
        f.close()

    doc = H.document_fromstring(content)
    paragraphs = doc.xpath('/html/body/div[2]/div[2]/div/div/ul/li/div[2]/p')
    return [
        p.text_content().encode('utf-8').replace('\r', '\n')
        for p in paragraphs
    ]
20
if __name__ == '__main__':
    # Pages to scrape; the second URL carries ``?start=100`` (presumably the
    # next page of the same topic -- confirm against the site).
    urls = ['http://www.douban.com/group/topic/12018319/',
            'http://www.douban.com/group/topic/12018319/?start=100']
    # ``with`` closes the output file even if a download or parse fails
    # part-way through, so already written text is flushed to disk.
    with open('zheda.txt', 'w') as outfile:
        for url in urls:
            for item in getart(url):
                outfile.write(item + '\n')
所有评论,共0条:( 我也来说两句)
代码
