#coding=utf-8
# Requires BeautifulSoup ("beautiful soup"): http://crummy.com/software/BeautifulSoup

import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup          # For processing HTML

def formalize(text):
    """Reflow *text* into paragraphs separated by blank lines.

    The input is split on newline characters. Each resulting line has its
    leading and trailing whitespace stripped; lines that are empty after
    stripping are dropped. Every remaining line is emitted followed by two
    newline characters.

    Returns the concatenated result, or an empty string when no line
    survives.
    """
    stripped = (raw.strip() for raw in text.split(u'\n'))
    return ''.join(piece + u'\n\n' for piece in stripped if piece)

# Download listing pages 1..100, pull the text out of every
# <div class="content"> element, and write each story to qiushi.txt as
# UTF-8, preceded by a numbered separator line.
#
# The `with` block guarantees the output file is flushed and closed even
# when a download or parse step raises part-way through the run.
with open("qiushi.txt", "w") as outfile:
    count = 0  # running number of stories written across all pages
    for i in range(1, 101):
        url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i

        # urllib2 responses are not context managers, so close explicitly.
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(html)
        # A bare string as the second findAll argument matches on CSS class.
        for content in soup.findAll('div', "content"):
            count += 1
            print("processing page %d, %d items added" % (i, count))

            # Gather the text nodes directly from the already-parsed tag;
            # serializing it and re-parsing the snippet is unnecessary.
            text = ''.join([e for e in content.recursiveChildGenerator()
                            if isinstance(e, unicode)])

            # unescape() only decodes &amp;, &lt; and &gt; by default, so
            # &quot; has to be mapped explicitly.
            text = urllib.unquote(unescape(text, {'&quot;': '"'}))
            text = formalize(text).encode("utf-8")

            # Separator line plus a blank line, then the story followed by
            # "\r\n\n" (the same bytes the original print statements wrote).
            outfile.write('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")
            outfile.write(text + "\r\n\n")