周六写了个脚本,抓取了某网站的Flash文件,一共抓取了8000个左右,硬盘空间不多了,唉。针对特定网站抓取文字或文件,对我来说犹如探囊取物,没什么挑战性。脚本大概就是下面这样子的(隐去了网站的名字),感兴趣可以看看,有问题可以探讨。

# -*- coding: cp936 -*-
# Filename: getflash.py

import urllib
import winsound
import re
import os
from sgmllib import SGMLParser
from threadpool import WorkerManager

###==================================================================
#打开一个网页,获取网页内容
def get_url_data(url):
    """Fetch a URL and return the page body as a string.

    Retries up to 5 times on network errors and returns None when every
    attempt failed, so callers can detect the failure and skip the page.
    """
    nFail = 0
    while nFail < 5:
        sock = None
        try:
            sock = urllib.urlopen(url)
            return sock.read()
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; urllib raises IOError on
        # network failures.
        except IOError:
            nFail += 1
        finally:
            # Close even when read() raises -- the original leaked the
            # socket on a failed read.
            if sock is not None:
                sock.close()
    print("get url fail:%s" % (url))
    return None

###==================================================================
#使用SGMLParser解析分类网页,获得想要的url list
class classLister(SGMLParser):
    """SGML parser that collects game-page hrefs from a category page.

    A tiny state machine: seeing the '/index.htm' link arms the scope
    (bScope=1), the next <tr> activates it (bScope=2), and from then on
    every href is appended to self.urls.
    """

    def reset(self):
        SGMLParser.reset(self)
        self.bScope = 0   # 0 = idle, 1 = armed, 2 = collecting
        self.urls = []    # hrefs gathered while collecting

    def start_tr(self, attrs):
        # A <tr> after the marker link switches us into collecting mode.
        if self.bScope == 1:
            self.bScope = 2

    def start_a(self, attrs):
        hrefs = [value for name, value in attrs if name == 'href']
        if not hrefs:
            return
        # The index link marks the start of the interesting table.
        if hrefs == ['/index.htm']:
            self.bScope = 1
        if self.bScope == 2:
            self.urls.extend(hrefs)

###==================================================================
#使用正则表达式解析特定网页,获得最终要下载文件的url
def get_swfurl(pageurl):
    """Extract the embedded flash path from a game page.

    Strips everything up to and including the ``var strN = `` assignment
    and everything from the trailing ``;`` to the end of the document,
    then removes the surrounding quotes.  Returns None when the page
    could not be fetched (the original crashed with a TypeError here).
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        # Fetch failed after retries -- propagate the failure instead of
        # passing None into re.sub().
        return None

    # Drop the document head up to the `var strN = ` assignment.
    htmlContent = re.sub(r'<!DOCTYPE.*var str\d+ = ', '', htmlContent,
                         flags=re.DOTALL)
    # Drop the trailing `;` and the rest of the page.
    htmlContent = re.sub(r";.*</HTML>", '', htmlContent, flags=re.DOTALL)

    return htmlContent.replace('"', '')


###==================================================================
#下载文件
def download_swf(url, path, count):
    """Download one flash file into *path*.

    url   -- relative game-page url taken from the category listing
    path  -- target directory (already created by the caller)
    count -- unique id prefixed to the filename to avoid name clashes

    Returns True on success, None on failure (after 10 retries).
    """
    pageurl = 'http://www.4399.com' + url
    #print "url =", pageurl
    swfpath = get_swfurl(pageurl)  # the actual flash url path
    if swfpath is None:
        # Page fetch failed -- the original crashed concatenating None.
        print("get url fail:%s" % (pageurl))
        return None
    swfurl = 'http://www.******.com/***' + swfpath

    # "<path>/<count>_<basename>" -- count guarantees uniqueness.
    filename = path + '/' + "%d_" % (count) + swfurl.split('/')[-1]
    print(filename)

    nFail = 0
    while nFail < 10:
        try:
            urllib.urlretrieve(swfurl, filename)
            return True
        # Narrowed from a bare `except:`; urlretrieve raises IOError
        # (incl. ContentTooShortError) on download failures.
        except IOError:
            nFail += 1
    print("get url fail:%s" % (swfurl))
    return None

   

###==================================================================
def parse_swfClass():
    """Walk every category index page and queue all flash downloads.

    Creates one directory per category (named after the loop index),
    parses the category page for game-page urls, and hands each one to
    the worker pool.  Blocks until all queued downloads finish.
    """
    count = 0  # unique id prefixed to each filename to avoid clashes
    wm = WorkerManager(500)
    for loop in range(1, 17):
        # Categories 10 and 15 are skipped -- presumably they do not
        # exist on the site (TODO confirm).
        if loop in (10, 15):
            continue

        path = "%d" % (loop)
        if not os.path.exists(path):
            os.mkdir(path)

        indexurl = r'http://www.********.com/******/%d_1.htm' % (loop)
        print(indexurl)

        # BUG FIX: the original called .replace() on the fetch result
        # *before* its None check, so a failed fetch raised
        # AttributeError and the check was dead code.
        htmlContent = get_url_data(indexurl)
        if htmlContent is None:
            continue
        htmlContent = htmlContent.replace('<br/>', '')

        parser = classLister()
        parser.feed(htmlContent)
        parser.close()
        for url in parser.urls:
            count += 1
            wm.add_job(download_swf, url, path, count)
    wm.wait_for_complete()

if __name__ == "__main__":
    print("gogogo")
    parse_swfClass()
    print("finished")

    # Two short beeps (783 Hz, 200 ms each) to signal completion.
    winsound.Beep(783, 200)
    winsound.Beep(783, 200)