#!/usr/bin/env python #coding: utf-8 import urllib2,urllib import re import sqlite3 import os,sys import time db = sqlite3.connect('/var/www/simplecd.old/verycd.sqlite3.db') db.text_factory = str path = os.path.dirname(os.path.realpath(sys.argv[0])) #proxies = {'http':'http://vpn.unswbbs.com:3128'} proxies = {'http':'http://localhost:3128'} #proxies = {'http':'http://obmem.com:30000'} proxy_support = urllib2.ProxyHandler(proxies) opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler) urllib2.install_opener(opener) def clear_idcache(id): #clear idcache: try: l1 = str(long(id)%10) l2 = str(long(id)/10%100) l3 = str(long(id)/1000) cachefile = path + '/idcache/%s/%s/%s.html'%(l1,l2,l3) if os.path.exists(cachefile): os.remove(cachefile) except: pass def doubanapi(id,conn=db): apikey = '047ca5407fe8b8a40d75efc17fba29ec' title = '幸运星' cat = 'movie' url = 'http://api.douban.com/%s/subjects?q=%s&start-index=1&max-results=1&alt=json&apikey=%s'%(cat,title,apikey) print url import json dbj = urllib2.urlopen(url).read() dbj = json.loads(dbj) for x in dbj['entry'][0].keys(): print x,dbj['entry'][0][x] print dbj['entry'][0]['title']['$t'] def douban(id,conn=db): print 'updating',id id = long(id) c = conn.cursor() c.execute('select title,category1,brief,content from verycd where verycdid=?',(id,)) (title,cat,brief,content) = c.fetchone() c.close() dbed = re.compile('Douban\d+').search(brief) imdbid = re.compile(r'http://www\.imdb\.com/title/(tt\d+)').search(content) if imdbid: imdbid = imdbid.group(1) print imdbid if (cat not in ['电影','音乐','剧集','动漫','综艺']) or dbed: print '...not movie or already updated' clear_idcache(id) return if title.find("《") == -1: name = title else: name = title[ title.find("《")+3: title.find("》") ] if cat == '音乐': durl = 'http://www.douban.com/subject_search?search_text='+name+'&cat=1003' elif imdbid: durl = 'http://www.douban.com/subject_search?search_text='+imdbid+'&cat=1002' else: durl = 'http://www.douban.com/subject_search?search_text='+name+'&cat=1002' durl = durl.replace(' ','+') print '...fetching douban page',name print durl try: ds = urllib2.urlopen(durl).read() except: print '...fetch error' return dbscore = re.compile(r'href=".*?/subject/(\d+)/".*?"rating_nums">([0-9.]+)<.*?(\d+)',re.DOTALL).search(ds) if not dbscore: print '...not found' return ins = brief.find('votes)') dbb = 'Douban%s:%s(%s votes)'%(dbscore.group(1),dbscore.group(2),dbscore.group(3)) if ins == -1: brief = dbb + brief else: brief = brief[:ins+6]+dbb+brief[ins+6:] c = conn.cursor() try: c.execute('update verycd set brief=? where verycdid=?',(brief,id)) c.close() conn.commit() except: c.close() conn.commit() print '...db update failure' return print '...updated',dbb clear_idcache(id) return True def updateall(): c = db.cursor() c.execute('select verycdid from verycd where category1="电影" and verycdid>6000 order by updtime desc') ids = c.fetchall() for id in ids: update = douban(id[0]) if update: time.sleep(3) def update(num=20): c = db.cursor() c.execute('select verycdid from verycd order by updtime desc limit ?',(num,)) ids = c.fetchall() for id in ids: douban(id[0]) if __name__ == '__main__': update(20) #douban(2802148) #updateall()