比这篇新的文章: Codee#2587
比这篇旧的文章: Codee#2585

spider.py

语言: Python, 标签: 网络爬虫 spider 2009/07/01发布 8个月前更新
作者: 白羽扬, 点击167次, 评论(0), 收藏者(0), , 打分:

背景
主题: 字体:
Python语言: spider.py
01 #coding:utf-8
02 '''
03 Created on 2009-4-28
04
05 @author: baiyuyang
06 '''
07 import urllib
08 import re
09 import string
10 import os
11
def getMainUrl(dir, indexUrl):
    """Download the page at ``indexUrl`` and save it as ``Index.html`` in ``dir``.

    The response body is decoded as GBK (undecodable bytes are replaced),
    re-encoded as UTF-8, and every literal ``gb2312`` in it is rewritten to
    ``utf-8`` so that a charset declaration in the page matches the new
    encoding.

    Args:
        dir: Existing directory that receives the saved page.  (The name
            shadows the builtin but is kept for caller compatibility.)
        indexUrl: URL of the page to download.

    Returns:
        The path of the saved ``Index.html`` file.

    Raises:
        IOError: If the download fails or the file cannot be written.
    """
    # Function-scope import keeps the block usable on Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    import contextlib

    # Fetch first: a failed download must not leave an empty Index.html
    # behind, and the response is always closed.
    with contextlib.closing(urlopen(indexUrl)) as response:
        raw = response.read()
    page = raw.decode('gbk', 'replace').encode('utf-8')
    page = page.replace(b'gb2312', b'utf-8')

    mainUrl = os.path.join(dir, 'Index.html')
    # Binary mode: the payload is already-encoded bytes and must be written
    # without newline translation.
    with open(mainUrl, 'wb') as f:
        f.write(page)
    return mainUrl
19
def analysisUrl(mainUrl):
    """Extract the absolute ``http://`` page links from a saved HTML file.

    A link is any run starting with ``http://`` and ending in ``.html``,
    ``.htm``, ``.jsp`` or ``.php``.  Whitespace and quote characters end a
    candidate URL, so two links separated by text are never merged into a
    single match.

    Args:
        mainUrl: Path of the HTML file to scan (despite the name, this is a
            local file path, not a URL).

    Returns:
        A list of the matched URLs in document order; duplicates are kept.
    """
    extensions = ('html', 'htm', 'jsp', 'php')
    # One alternative per extension, tried in the listed order.
    pattern = re.compile(
        '|'.join(r'http://[^\s>)"\']+\.' + ext for ext in extensions))

    with open(mainUrl, 'rb') as f:
        text = f.read()
    # On Python 3 the bytes must be decoded before matching a text pattern;
    # on Python 2 ``bytes`` is ``str`` and the data is used unchanged.
    if not isinstance(text, str):
        text = text.decode('utf-8', 'replace')

    url = pattern.findall(text)
    print('len(url): %d' % len(url))
    return url
26
def grabUrl(url, dir):
    """Download every URL in ``url`` and save page ``i`` as ``<dir>/<i>.html``.

    Each body is decoded as GBK (undecodable bytes are dropped), re-encoded
    as UTF-8, and every literal ``gb2312`` in it is rewritten to ``utf-8``.
    The crawl is best-effort: a page that cannot be fetched or written is
    reported and skipped, and no file is created for it.

    Args:
        url: Sequence of URLs to download.
        dir: Existing directory that receives the numbered files.  (The name
            shadows the builtin but is kept for caller compatibility.)
    """
    # Function-scope import keeps the block usable on Python 2 and Python 3.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    import contextlib

    for index, address in enumerate(url):
        fileName = os.path.join(dir, '%d.html' % index)
        print('正在抓取第%d个网页:' % index + address)
        try:
            # No second argument: passing one turns the request into a POST.
            with contextlib.closing(urlopen(address)) as response:
                raw = response.read()
            temp = raw.decode('gbk', 'ignore').encode('utf-8', 'ignore')
            temp = temp.replace(b'gb2312', b'utf-8')
            # Open the file only after a successful fetch so failures do not
            # leave empty files behind.
            with open(fileName, 'wb') as f:
                f.write(temp)
        except Exception as error:
            # Keep going on a bad link, but say so instead of hiding it;
            # KeyboardInterrupt/SystemExit still propagate.
            print('抓取第%d个网页失败: %r' % (index, error))
41  
42
43
def _crawlPage(pageUrl, dir):
    """Save ``pageUrl`` into ``dir``, download its links there, return them."""
    print('开始抓取根url:')
    mainUrl = getMainUrl(dir, pageUrl)
    print(' 抓取根url完成,开始分析url:')
    links = analysisUrl(mainUrl)
    print('分析url完成,开始抓取:')
    grabUrl(links, dir)
    return links


def spider(root, dir):
    """Crawl ``root`` and every page it links to, two levels deep.

    The root page and the pages it links to are stored directly in ``dir``.
    Each linked page is then crawled in turn, with its own output going to
    the sub-directory ``<dir>/root<N>``, where ``N`` is the position of the
    link on the root page.

    Args:
        root: URL of the start page.
        dir: Existing output directory.  (The name shadows the builtin but is
            kept for caller compatibility.)

    Raises:
        IOError: If the root page itself cannot be downloaded or saved.
    """
    links = _crawlPage(root, dir)

    for count, link in enumerate(links):
        # Always derive the sub-directory from the base directory; the path
        # that is checked is the same path that is created.
        subDir = os.path.join(dir, 'root%d' % count)
        if not os.path.isdir(subDir):
            os.makedirs(subDir)
        try:
            _crawlPage(link, subDir)
        except (IOError, ValueError) as error:
            # One dead link must not abort the rest of the crawl.
            print('抓取根url失败: %r' % (error,))

    print('抓取完成')
69
70
71
def main():
    """Crawl the Xinhuanet politics section into the default output folder.

    The output directory is created when it does not exist yet, so the first
    run on a fresh machine does not fail when the index page is saved.
    """
    dir = r'E:\gxwWebPage\xinhuanet'
    root = r'http://www.xinhuanet.com/politics/'
    if not os.path.isdir(dir):
        os.makedirs(dir)
    spider(root, dir)


if __name__ == '__main__':
    main()


所有评论,共0条:( 我也来说两句)


发表评论

注册登录后再发表评论