Out of annoyance at Baidu tricking users into upgrading to the new Baidu Space (the new Space is awful), I wanted to back up my 2600-odd blog posts locally, either to migrate them later or just to keep as a memento. When I wrote this post I searched Baidu and found only one article that logged in through the v2 endpoint, written in Python. I ran it and it didn't work, so once again: if you want something done, do it yourself.
I have to log in to Baidu to crawl the blog, because some posts are visible only to me. The last time I logged in to Baidu with Python was five or six years ago; back then the login URL was http://passport.baidu.com/?login, and now it is http://passport.baidu.com/v2/api/?login. Baidu has made a little progress.
How logging in to Baidu works:
1. Visit https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=false, which sets a cookie.
2. Visit https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=false a second time to pick up the token, and save the cookie.
3. Build the POST parameters (mainly the username, password, and token) and post them to http://passport.baidu.com/v2/api/?login (steps 1-3 are condensed into the sketch right after this list).
4. Once logged in you can do more or less whatever you want; I use it to download my own Baidu blog. Just don't crawl too aggressively, or Baidu will block you.
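Condensed, steps 1-3 boil down to the sketch below. It is distilled from the full script that follows; 'you' and 'secret' are placeholders, and the POST dictionary is trimmed, since the real login needs more fields (the complete set is in the script):

import cookielib, urllib2, urllib, re

cj = cookielib.LWPCookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
getapi = 'https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=false'
opener.open(getapi)               # step 1: the first visit plants a cookie
rsp = opener.open(getapi).read()  # step 2: the same URL now returns a token
token = re.findall(r"bdPass\.api\.params\.login_token='(.*?)';", rsp)[0]
data = urllib.urlencode({'username': 'you', 'password': 'secret', 'token': token,
                         'charset': 'UTF-8', 'tpl': 'mn'})  # trimmed; see full script
opener.open('http://passport.baidu.com/v2/api/?login', data)  # step 3: the login POST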
You can use the code below as-is, or adapt it for migrating your blog or other purposes.
#!/usr/bin/python
#coding:utf8
#Author = yyobin@gmail.com
#Create = 20120517
import cookielib, urllib2, urllib
import os,sys,socket,re
# Parse how many listing pages the blog has; the page embeds a JS line like
# var PagerInfo = {allCount : '2600',pageSize : '10',curPage : '1'};
pageStr = r"""var PagerInfo = {allCount : '(\d+)',pageSize : '(\d+)',curPage : '\d+'};"""
pageObj = re.compile(pageStr, re.DOTALL)
# Extract the login token
login_tokenStr = '''bdPass.api.params.login_token='(.*?)';'''
login_tokenObj = re.compile(login_tokenStr, re.DOTALL)
# Extract each post's URL and title
blogStr = r'''<div class="hide q-username"><a href=".*?" class=a-normal target=_blank>.*?</a></div><a href="(.*?)" class="a-incontent a-title" target=_blank>(.*?)</a></div><div class=item-content>'''
blogObj = re.compile(blogStr, re.DOTALL)
class Baidu(object):
    def __init__(self, user='', psw='', blog=''):
        self.user = user  # Chinese user IDs are not handled yet
        self.psw = psw
        self.blog = blog
        if not user or not psw or not blog:
            print "Please provide all 3 params: user, psw, blog"
            sys.exit(0)
        if not os.path.exists(self.user):
            os.mkdir(self.user)
        self.cookiename = 'baidu%s.cookie' % (self.user)
        self.token = ''
        self.allCount = 0
        self.pageSize = 10
        self.totalpage = 0
        self.logined = False
        self.cj = cookielib.LWPCookieJar()
        # Reuse the cookie file from a previous run if there is one
        try:
            self.cj.revert(self.cookiename)
            self.logined = True
            print "OK, reusing saved cookie"
        except Exception, e:
            print e
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        self.opener.addheaders = [('User-agent', 'Opera/9.23')]
        urllib2.install_opener(self.opener)
        socket.setdefaulttimeout(30)
    # Log in to Baidu
    def login(self):
        # If no saved cookie was loaded, go through the login handshake
        if not self.logined:
            print "need login"
            # First visit: its only purpose is to get a cookie saved
            qurl = '''https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=false'''
            r = self.opener.open(qurl)
            self.cj.save(self.cookiename)
            # Second visit: this time the response carries the token
            qurl = '''https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=false'''
            r = self.opener.open(qurl)
            rsp = r.read()
            self.cj.save(self.cookiename)
            # Pull the token out with the regex
            matched_objs = login_tokenObj.findall(rsp)
            if matched_objs:
                self.token = matched_objs[0]
                print self.token
                # Then log in with the token
                post_data = urllib.urlencode({'username': self.user,
                                              'password': self.psw,
                                              'token': self.token,
                                              'charset': 'UTF-8',
                                              'callback': 'parent.bd12Pass.api.login._postCallback',
                                              'index': '0',
                                              'isPhone': 'false',
                                              'mem_pass': 'on',
                                              'loginType': '1',
                                              'safeflg': '0',
                                              'staticpage': 'https://passport.baidu.com/v2Jump.html',
                                              'tpl': 'mn',
                                              'u': 'http://www.baidu.com/',
                                              'verifycode': '',
                                              })
                #path = 'http://passport.baidu.com/?login'
                path = 'http://passport.baidu.com/v2/api/?login'
                self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
                self.opener.addheaders = [('User-agent', 'Opera/9.23')]
                urllib2.install_opener(self.opener)
                headers = {
                    "Accept": "image/gif, */*",
                    "Referer": "https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F",
                    "Accept-Language": "zh-cn",
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Accept-Encoding": "gzip, deflate",
                    "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
                    "Host": "passport.baidu.com",
                    "Connection": "Keep-Alive",
                    "Cache-Control": "no-cache"
                }
                req = urllib2.Request(path,
                                      post_data,
                                      headers=headers,
                                      )
                rsp = self.opener.open(req).read()
                # If you think it is necessary, inspect rsp here to check whether
                # the login succeeded; I printed it and the login was fine
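                # A hedged sanity check, and an assumption about the response
                # format rather than something verified: the reply is a tiny
                # script that jumps to v2Jump.html carrying an error code, so
                # print rsp once and adapt this check to what you actually see
                if 'error=0' not in rsp and 'err_no=0' not in rsp:
                    print "warning: login may have failed, dump follows"
                    print rsp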
                self.cj.save(self.cookiename)
            else:
                print "Login Fail"
                sys.exit(0)
    # Get the total number of listing pages; if there are private posts, the
    # count differs between logged-in and logged-out visits
    def getTotalPage(self):
        req2 = urllib2.Request(self.blog)
        rsp = urllib2.urlopen(req2).read()
        if rsp:
            rsp = rsp.replace('\r', '').replace('\n', '').replace('\t', '')
            matched_objs = pageObj.findall(rsp)
            if matched_objs:
                obj0, obj1 = matched_objs[0]
                self.allCount = int(obj0)
                self.pageSize = int(obj1)
                # Ceiling division, so a final partial page is still fetched
                self.totalpage = (self.allCount + self.pageSize - 1) / self.pageSize
                print self.allCount, self.pageSize, self.totalpage
    # Get the post links on one listing page
    def fetchPage(self, url):
        req = urllib2.Request(url)
        rsp = urllib2.urlopen(req).read()
        if rsp:
            rsp = rsp.replace('\r', '').replace('\n', '').replace('\t', '')
            matched_objs = blogObj.findall(rsp)
            if matched_objs:
                for obj in matched_objs:
                    # Could be rewritten with threads, single-threaded is slow;
                    # see the threaded sketch after the script
                    self.download(obj[0], obj[1])
    def downloadBywinget(self, url, title):
        pass  # e.g. call wget or some other third-party tool; fill in the params yourself
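    # A sketch of what downloadBywinget could look like: shell out to wget with
    # the saved cookies so private posts stay readable. Assumptions: wget is on
    # PATH, and the cookie file is in Netscape format (wget's --load-cookies
    # cannot read LWP format, so you would save with cookielib.MozillaCookieJar
    # instead of LWPCookieJar above). The method name is hypothetical.
    def downloadBywgetSketch(self, url, title):
        import subprocess
        path = '%s/%s.html' % (self.user, title.decode('utf-8'))
        subprocess.call(['wget', '-q', '--load-cookies', self.cookiename,
                         '-O', path, 'http://hi.baidu.com%s' % (url)])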
    # Download one post
    def download(self, url, title):
        # Strip path separators from the title so it is a valid filename
        path = '%s/%s.html' % (self.user, title.decode('utf-8').replace('/', '_'))
        url = 'http://hi.baidu.com%s' % (url)
        print "Download url %s" % (url)
        nFail = 0
        while nFail < 5:
            try:
                # Go through urllib2 so the installed cookie-aware opener is
                # used; plain urllib.urlopen skips the cookies and cannot see
                # private posts
                sock = urllib2.urlopen(url)
                htmlSource = sock.read()
                myfile = open(path, 'w')
                myfile.write(htmlSource)
                myfile.close()
                sock.close()
                return
            except:
                nFail += 1
                print 'download blog fail:%s' % (url)
    def downloadall(self):
        for page in range(1, self.totalpage + 1):
            url = "%s?page=%d" % (self.blog, page)
            # Could be rewritten with threads, single-threaded is slow
            self.fetchPage(url)
def main():
    user = 'yyobin'                         # your Baidu login name
    psw = '***********'                     # your Baidu password; without credentials the private posts are unreachable
    blog = "http://hi.baidu.com/new/yobin"  # your own Baidu blog URL
    baidu = Baidu(user, psw, blog)
    baidu.login()
    baidu.getTotalPage()
    baidu.downloadall()
if __name__ == '__main__':
    main()
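To run the script, fill in user, psw, and blog in main() and start it with python; the downloaded posts end up as .html files in a directory named after your user.

As the comments in fetchPage and downloadall say, the single-threaded loop is slow. Here is a minimal threaded sketch of the page loop built on multiprocessing.dummy; the function name, pool size, and structure are my own choices rather than part of the script above, and the pool is deliberately small so Baidu does not block you for crawling too fast:

from multiprocessing.dummy import Pool  # thread pool behind the multiprocessing API

def download_all_threaded(baidu, workers=4):
    # The same listing-page URLs that downloadall builds, fetched by a few threads
    urls = ["%s?page=%d" % (baidu.blog, page)
            for page in range(1, baidu.totalpage + 1)]
    pool = Pool(workers)             # keep this small; Baidu bans aggressive crawlers
    pool.map(baidu.fetchPage, urls)  # each worker parses one page and downloads its posts
    pool.close()
    pool.join()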