比这篇新的文章: Codee#2587
比这篇旧的文章: Codee#2585
作者: 白羽扬, 点击167次, 评论(0), 收藏者(0), , 打分:
所有评论,共0条:( 我也来说两句)
比这篇旧的文章: Codee#2585
spider.py
语言: Python, 标签: 网络爬虫 spider, 2009/07/01发布, 8个月前更新, 作者: 白羽扬, 点击167次, 评论(0), 收藏者(0), , 打分:
Python语言: spider.py
01 #coding:utf-8
02 '''
03 Created on 2009-4-28
04
05 @author: baiyuyang
06 '''
07 import urllib
08 import re
09 import string
10 import os
11
# Fetch the root URL and save it locally.
def getMainUrl(dir,indexUrl):
    """Download the page at ``indexUrl`` and save it as ``Index.html`` in ``dir``.

    The response body is decoded as GBK (undecodable bytes are replaced),
    re-encoded as UTF-8, and every occurrence of the text ``gb2312`` is
    replaced with ``utf-8`` (presumably so that the page's charset
    declaration matches the new encoding).

    Returns the path of the saved file. Errors from the download or from
    writing the file propagate to the caller.
    """
    mainUrl = os.path.join(dir, 'Index.html')
    # Download first: opening the output file up front would truncate it and
    # leave an empty Index.html behind if the request failed.
    response = urllib.urlopen(indexUrl)
    try:
        raw = response.read()
    finally:
        response.close()
    page = raw.decode('gbk','replace').encode('utf-8').replace('gb2312','utf-8')
    with open(mainUrl,'w') as f:
        f.write(page)
    return mainUrl
19
# Parse the saved root page and collect the URLs it contains.
def analysisUrl(mainUrl):
    """Return the list of ``http://`` links found in the file at ``mainUrl``.

    A link is any run of text that starts with ``http://``, contains no
    ``>``, ``)`` or ``"`` characters, and ends in ``.html``, ``.htm``,
    ``.jsp`` or ``.php``. Matches are returned in the order they appear in
    the file; duplicates are kept. The number of matches is printed.
    """
    pattern = re.compile(r'http://[^>)"]+\.html|http://[^>)"]+\.htm|http://[^>)"]+\.jsp|http://[^>)"]+\.php')
    # Use a context manager so the file handle is closed deterministically.
    with open(mainUrl) as page:
        url = pattern.findall(page.read())
    print('len(url): %d' % len(url))
    return url
26
# Download the pages in the URL list one by one.
def grabUrl(url,dir):
    """Download every URL in ``url`` into ``dir`` as ``0.html``, ``1.html``, ...

    The file number is the URL's position in the list. Each page is decoded
    as GBK (undecodable bytes are dropped), re-encoded as UTF-8, and has the
    text ``gb2312`` replaced with ``utf-8`` before it is written.

    The crawl is best-effort: a page that cannot be fetched or saved is
    reported on stdout and skipped, and the remaining URLs are still tried.
    """
    for index, link in enumerate(url):
        fileName = os.path.join(dir, '%d.html' % index)
        print('正在抓取第%d个网页:'%index+link)
        try:
            # urlopen's second positional argument is POST data, not a file
            # mode, so it must be omitted to issue a plain GET.
            ftemp = urllib.urlopen(link)
            try:
                temp = ftemp.read()
            finally:
                ftemp.close()
            temp = temp.decode('gbk','ignore').encode('utf-8','ignore')
            temp = temp.replace('gb2312','utf-8')
            # Open the output only once the content is ready, so a failed
            # download does not leave an empty file or a leaked handle.
            with open(fileName,'w') as f:
                f.write(temp)
        except Exception as err:
            # Skip the failing page but say so; catching Exception (rather
            # than a bare except) still lets Ctrl-C stop the crawl.
            print('抓取失败: %s (%s)' % (link, err))
41
42
43
def spider(root,dir):
    """Crawl ``root`` and then each page it links to, one level deep.

    The root page is saved in ``dir`` and the pages it links to are
    downloaded into ``dir`` as well. Each of those links is then treated as
    a root in turn: the page is saved, analysed, and its own links are
    downloaded into the sub-directory ``root<N>`` of ``dir``, where ``N`` is
    the link's position in the root page's link list. Sub-directories are
    created when missing.
    """
    print('开始抓取根url:')
    mainUrl = getMainUrl(dir,root)
    print(' 抓取根url完成,开始分析url:')
    url = analysisUrl(mainUrl)
    print('分析url完成,开始抓取:')
    grabUrl(url,dir)

    for count, indexUrl in enumerate(url):
        # Build the path with os.path.join: a literal '\root%d' starts with
        # the '\r' escape (carriage return) and never names a real directory.
        # ``dir`` itself is left untouched so the sub-directories are
        # siblings instead of accumulating as ...root0root1root2.
        subDir = os.path.join(dir, 'root%d' % count)
        if not os.path.exists(subDir):
            os.mkdir(subDir)
        print('开始抓取根url:')
        subMainUrl = getMainUrl(subDir,indexUrl)
        print(' 抓取根url完成,开始分析url:')
        # Keep the second-level links in their own name so the list being
        # iterated over is not rebound mid-loop.
        subUrl = analysisUrl(subMainUrl)
        print('分析url完成,开始抓取:')
        grabUrl(subUrl,subDir)

    print('抓取完成')
69
70
71
def main():
    """Run the spider on the hard-coded start URL and output directory."""
    start_page = r'http://www.xinhuanet.com/politics/'
    target_folder = r'E:\gxwWebPage\xinhuanet'
    spider(start_page, target_folder)


# Only start crawling when executed as a script, not when imported.
if __name__ == '__main__':
    main()
02 '''
03 Created on 2009-4-28
04
05 @author: baiyuyang
06 '''
07 import urllib
08 import re
09 import string
10 import os
11
# Fetch the root URL and save it locally.
def getMainUrl(dir,indexUrl):
    """Download the page at ``indexUrl`` and save it as ``Index.html`` in ``dir``.

    The response body is decoded as GBK (undecodable bytes are replaced),
    re-encoded as UTF-8, and every occurrence of the text ``gb2312`` is
    replaced with ``utf-8`` (presumably so that the page's charset
    declaration matches the new encoding).

    Returns the path of the saved file. Errors from the download or from
    writing the file propagate to the caller.
    """
    mainUrl = os.path.join(dir, 'Index.html')
    # Download first: opening the output file up front would truncate it and
    # leave an empty Index.html behind if the request failed.
    response = urllib.urlopen(indexUrl)
    try:
        raw = response.read()
    finally:
        response.close()
    page = raw.decode('gbk','replace').encode('utf-8').replace('gb2312','utf-8')
    with open(mainUrl,'w') as f:
        f.write(page)
    return mainUrl
19
# Parse the saved root page and collect the URLs it contains.
def analysisUrl(mainUrl):
    """Return the list of ``http://`` links found in the file at ``mainUrl``.

    A link is any run of text that starts with ``http://``, contains no
    ``>``, ``)`` or ``"`` characters, and ends in ``.html``, ``.htm``,
    ``.jsp`` or ``.php``. Matches are returned in the order they appear in
    the file; duplicates are kept. The number of matches is printed.
    """
    pattern = re.compile(r'http://[^>)"]+\.html|http://[^>)"]+\.htm|http://[^>)"]+\.jsp|http://[^>)"]+\.php')
    # Use a context manager so the file handle is closed deterministically.
    with open(mainUrl) as page:
        url = pattern.findall(page.read())
    print('len(url): %d' % len(url))
    return url
26
# Download the pages in the URL list one by one.
def grabUrl(url,dir):
    """Download every URL in ``url`` into ``dir`` as ``0.html``, ``1.html``, ...

    The file number is the URL's position in the list. Each page is decoded
    as GBK (undecodable bytes are dropped), re-encoded as UTF-8, and has the
    text ``gb2312`` replaced with ``utf-8`` before it is written.

    The crawl is best-effort: a page that cannot be fetched or saved is
    reported on stdout and skipped, and the remaining URLs are still tried.
    """
    for index, link in enumerate(url):
        fileName = os.path.join(dir, '%d.html' % index)
        print('正在抓取第%d个网页:'%index+link)
        try:
            # urlopen's second positional argument is POST data, not a file
            # mode, so it must be omitted to issue a plain GET.
            ftemp = urllib.urlopen(link)
            try:
                temp = ftemp.read()
            finally:
                ftemp.close()
            temp = temp.decode('gbk','ignore').encode('utf-8','ignore')
            temp = temp.replace('gb2312','utf-8')
            # Open the output only once the content is ready, so a failed
            # download does not leave an empty file or a leaked handle.
            with open(fileName,'w') as f:
                f.write(temp)
        except Exception as err:
            # Skip the failing page but say so; catching Exception (rather
            # than a bare except) still lets Ctrl-C stop the crawl.
            print('抓取失败: %s (%s)' % (link, err))
41
42
43
def spider(root,dir):
    """Crawl ``root`` and then each page it links to, one level deep.

    The root page is saved in ``dir`` and the pages it links to are
    downloaded into ``dir`` as well. Each of those links is then treated as
    a root in turn: the page is saved, analysed, and its own links are
    downloaded into the sub-directory ``root<N>`` of ``dir``, where ``N`` is
    the link's position in the root page's link list. Sub-directories are
    created when missing.
    """
    print('开始抓取根url:')
    mainUrl = getMainUrl(dir,root)
    print(' 抓取根url完成,开始分析url:')
    url = analysisUrl(mainUrl)
    print('分析url完成,开始抓取:')
    grabUrl(url,dir)

    for count, indexUrl in enumerate(url):
        # Build the path with os.path.join: a literal '\root%d' starts with
        # the '\r' escape (carriage return) and never names a real directory.
        # ``dir`` itself is left untouched so the sub-directories are
        # siblings instead of accumulating as ...root0root1root2.
        subDir = os.path.join(dir, 'root%d' % count)
        if not os.path.exists(subDir):
            os.mkdir(subDir)
        print('开始抓取根url:')
        subMainUrl = getMainUrl(subDir,indexUrl)
        print(' 抓取根url完成,开始分析url:')
        # Keep the second-level links in their own name so the list being
        # iterated over is not rebound mid-loop.
        subUrl = analysisUrl(subMainUrl)
        print('分析url完成,开始抓取:')
        grabUrl(subUrl,subDir)

    print('抓取完成')
69
70
71
def main():
    """Run the spider on the hard-coded start URL and output directory."""
    start_page = r'http://www.xinhuanet.com/politics/'
    target_folder = r'E:\gxwWebPage\xinhuanet'
    spider(start_page, target_folder)


# Only start crawling when executed as a script, not when imported.
if __name__ == '__main__':
    main()
所有评论,共0条:( 我也来说两句)
代码
