周六写了个脚本,抓取了某网站的Flash文件,一共抓取了8000个左右,硬盘不多了,唉。针对特定网站抓取文字或文件,对我来说犹如探囊取物,都没啥挑战性。脚本大概就是下面这样子的(隐去了网站的名字),感兴趣可以看看,有问题可以探讨。
# -*- coding: cp936 -*-
# Filename: getflash.py
import urllib
import winsound
import re
import os
from sgmllib import SGMLParser
from threadpool import WorkerManager
###==================================================================
#打开一个网页,获取网页内容
def get_url_data(url):
nFail = 0
while nFail < 5:
try:
sock = urllib.urlopen(url)
htmlSource = sock.read()
sock.close()
return htmlSource
except:
nFail += 1
print "get url fail:%s" % (url)
return None
###==================================================================
#使用SGMLParser解析分类网页,获得想要的url list
class classLister(SGMLParser):
    """SGML parser that collects the href of every <a> tag appearing
    inside the table rows that follow the '/index.htm' marker anchor.

    Collected links are available in ``self.urls`` after ``feed()``.
    """

    def reset(self):
        SGMLParser.reset(self)
        # Scope state: 0 = before the marker anchor,
        #              1 = marker seen, waiting for the next <tr>,
        #              2 = inside the rows we want to harvest.
        self.bScope = 0
        self.urls = []

    def start_tr(self, attrs):
        # The first <tr> after the marker opens the harvesting scope.
        if self.bScope == 1:
            self.bScope = 2

    def start_a(self, attrs):
        hrefs = [value for name, value in attrs if name == 'href']
        if not hrefs:
            return
        if hrefs == ['/index.htm']:
            self.bScope = 1
        if self.bScope == 2:
            self.urls.extend(hrefs)
###==================================================================
#使用正则表达式解析特定网页,获得最终要下载文件的url
def extract_swf_path(htmlContent):
    """Strip the HTML wrapping from a Flash wrapper page and return
    the bare swf path embedded in it, with quotes removed."""
    # Drop everything from the doctype through the 'var strN = '
    # assignment that precedes the quoted swf path.
    head_pat = re.compile(r'<!DOCTYPE.*var str\d+ = ', re.DOTALL)
    # Drop everything from the terminating ';' to the end of the page.
    tail_pat = re.compile(r";.*</HTML>", re.DOTALL)
    cleaned = head_pat.sub('', htmlContent)
    cleaned = tail_pat.sub('', cleaned)
    return cleaned.replace('"', '')

def get_swfurl(pageurl):
    """Download *pageurl* and extract the swf file path from it.

    Raises IOError when the page could not be fetched (the original
    crashed with an opaque TypeError by running a regex over None).
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        raise IOError("could not fetch page: %s" % pageurl)
    return extract_swf_path(htmlContent)
###==================================================================
#下载文件
def download_swf(url,path,count):
path += '/'
url = 'http://www.4399.com' + url
#print "url =",url
swfurl = 'http://www.******.com/***' + get_swfurl(url)#具体的Flash url
filename = path + "%d_" %(count)
mylist = swfurl.split('/')
filename += mylist[-1]
print filename
nFail = 0
while nFail < 10:
try:
urllib.urlretrieve(swfurl, filename)
return True
except:
nFail += 1
print "get url fail:%s" % (swfurl)
return None
###==================================================================
def parse_swfClass():
count = 0 #防止重名,每个文件前加上一个独一无二的id标识
wm = WorkerManager(500)
for loop in range(1,17):
if loop == 10 or loop == 15:
continue
path = "%d" % (loop)
if not os.path.exists(path):
os.mkdir(path)
indexurl = r'http://www.********.com/******/%d_1.htm' % (loop)
print indexurl
htmlContent = get_url_data(indexurl).replace('<br/>','')
if htmlContent == None:
continue
parser = classLister()
parser.feed(htmlContent)
parser.close()
for url in parser.urls:
count += 1
wm.add_job(download_swf,url,path,count)
wm.wait_for_complete()
# Script entry point: crawl everything, then beep twice (winsound is
# Windows-only) to signal completion.
if __name__ == "__main__":
    print "gogogo"
    parse_swfClass()
    print "finished"
    winsound.Beep(783,200)
    winsound.Beep(783,200)
...