第一次写,写得很不规范,只要自己能用就行了。写代码的乐趣是难以描绘在纸上的,写完之后的成果也确实能帮人大幅度地减少劳动量。
没有保存网页到本地,直接在网上读,俺用这个脚本去抓300多个网页的数据不到4分钟就搞定了,很快。以后网上再有人招从网页抓取内容保存到本地的,俺就可以去应征了,哈哈。
俺写了两个文件,其实写在一个文件里也可以,只不过想重用性高一些。源码如下,献丑了。
gethtml.py
#!/usr/bin/python
# -*- coding: cp936 -*-
# Filename: gethtml.py
import urllib, urllister,winsound
def get_urls(url_template="http://www.×××××××.com/×××××/12_P%d",
             first_page=1, last_page=498):
    """Build the list of listing-page URLs to download.

    Args:
        url_template: URL with a single ``%d`` placeholder that receives the
            page number. The default is the redacted placeholder address from
            the original script; replace it with the real site.
        first_page: Number of the first page to include.
        last_page: Number of the last page to include (inclusive).

    Returns:
        A list of URL strings, one per page, in ascending page order. The
        list is empty when ``last_page < first_page``.

    Progress markers are printed before and after the list is built.
    """
    print("General url begins.")
    # range() excludes its stop value, so add 1 to really reach last_page;
    # the original range(1, 498) silently skipped page 498.
    groupurls = [url_template % page
                 for page in range(first_page, last_page + 1)]
    print("General url ends.")
    return groupurls
# Script body: download every listing page, let urllister.classLister pull out
# the text it collects in `myname`, and append that text to result.txt.
print("Program begins.")
file_dest = "result.txt"
myurls = get_urls()
for myurl in myurls:
    usock = urllib.urlopen(myurl)
    try:
        html = usock.read()
    finally:
        # Always release the connection, even when the read fails part-way.
        usock.close()

    parser = urllister.classLister()
    parser.feed(html)
    parser.close()

    # Open the output once per page instead of once per extracted string, and
    # close it in `finally` so a failed open can never reach close() on an
    # unbound (or stale) file object.
    try:
        out = open(file_dest, 'a')
        try:
            out.writelines(parser.myname)
        finally:
            out.close()
    except IOError:
        # Best effort, as before: report the failure and go on to the next
        # page. Only I/O errors are swallowed; programming errors propagate.
        print("write file error!!!")

# The loop has no `break`, so the original `for ... else` always ran this.
print("Congraduatons, download finied.")
# Three short beeps (783 Hz, 200 ms each) signal completion.
for _ in range(3):
    winsound.Beep(783, 200)
第二个文件urllister.py,继承了SGMLParser并重载了它的方法,根据自己的需求写了一个类,想抓啥内容就在这个文件里写就好。
#!/usr/bin/python
# -*- coding: cp936 -*-
# Filename: urllister.py
import sys
# Name of the encoding reported by sys.getfilesystemencoding(), captured once
# when this module is imported.
# NOTE(review): this assignment rebinds the builtin name `type` for the whole
# module.
type = sys.getfilesystemencoding()
from sgmllib import SGMLParser
class classLister(SGMLParser):
    """SGML parser that collects shop names and shop descriptions.

    Text is appended to ``self.myname`` in document order when it appears

    * inside an ``<a href=...>`` that is itself inside ``<p class="ShopName">``
    * anywhere inside ``<div class="ShopContent">``

    Collected strings are re-encoded from UTF-8 to the local filesystem
    encoding (assumes the fetched pages are UTF-8 -- TODO confirm for the
    target site).

    NOTE: any closing ``</div>`` clears the content flag and any closing
    ``</p>`` clears the shop-name flag, so nested elements of the same kind end
    the capture early.
    """

    def reset(self):
        """Reset all parser state (SGMLParser calls this at instantiation)."""
        SGMLParser.reset(self)
        # State flags toggled by the start_*/end_* handlers.
        self.is_url = False
        self.is_shopname = False
        self.is_Contant = False
        self.urls = []      # never filled in this class; kept for callers
        self.myname = []    # extracted text, in document order
        self.myclass = []   # only ever reset (see end_p); kept for callers
        # CSS class names that mark the elements of interest.
        self.shopname = "ShopName"
        self.shopContent = "ShopContent"
        # Target encoding for collected text. getfilesystemencoding() may
        # return None on some Python 2 Unix setups, hence the fallback.
        self.output_encoding = (sys.getfilesystemencoding()
                                or sys.getdefaultencoding())

    def _attr_values(self, attrs, name):
        """Return every value of attribute *name* in *attrs*, in order."""
        return [value for key, value in attrs if key == name]

    def start_div(self, attrs):
        """Start capturing when the div's first class is the content class."""
        classes = self._attr_values(attrs, 'class')
        if classes and classes[0] == self.shopContent:
            self.is_Contant = True

    def end_div(self):
        """Stop content capture and link tracking at any closing div."""
        self.is_url = False
        self.is_Contant = False

    def start_p(self, attrs):
        """Mark a shop-name paragraph when its first class matches."""
        classes = self._attr_values(attrs, 'class')
        if classes and classes[0] == self.shopname:
            self.is_shopname = True

    def end_p(self):
        """Leave the shop-name paragraph and clear link tracking."""
        self.is_url = False
        self.myclass = []
        self.is_shopname = False

    def start_a(self, attrs):
        """Mark that we are inside a link that carries an href attribute."""
        if self._attr_values(attrs, 'href'):
            self.is_url = True

    def end_a(self):
        """Leave the current link."""
        self.is_url = False

    def handle_data(self, text):
        """Record *text* if it is a shop-name link or shop content.

        The text is appended once for each condition that holds, so it is
        recorded twice when a shop-name link sits inside the content div.
        """
        in_shop_link = self.is_shopname and self.is_url
        if not (in_shop_link or self.is_Contant):
            return
        # Convert exactly once. The original rebound `text` in the first
        # branch, so the second branch decoded already re-encoded bytes, which
        # raises UnicodeDecodeError whenever the local encoding is not UTF-8.
        converted = text.decode('UTF-8').encode(self.output_encoding)
        if in_shop_link:
            self.myname.append(converted)
        if self.is_Contant:
            self.myname.append(converted)
...