#!/usr/bin/env python
#coding: utf-8
#
# scdd.py daemon process
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23
#
# NOTE(review): this text was recovered from a copy in which every line break
# had been collapsed and in which several spans are missing (apparently
# anything that looked like markup was dropped).  The line structure below is
# restored from the surviving statements.  Nothing that was lost has been
# invented; each gap is marked with NOTE(review) where it occurs.  Known gaps:
#   * the tail of FetchManger.thread_fetch (everything after the icon lookup),
#   * the whole of FetchManger.runthread (called, never defined here),
#   * the header and opening statements of FetchManger.fetch, which must have
#     bound `idx`, `conn`, `last_updtime`, `timesec` and the first values of
#     `updts` and `timenow`,
#   * the literal text between the groups of two regular expressions.
# Restore those from version control before running this file.

import os
import sys
import time
import re
#from daemon import Daemon
import sqlite3
import MySQLdb
import fetchvc_noth as fetchvc
import fcmt_noth as fcmt
import douban
import download
import urllib2
from Queue import Queue
from threading import Thread, stack_size, Lock
from random import randint
from feed import feed
import memcache

mc = memcache.Client(['127.0.0.1:11211'], debug=0)

# 32768*32 bytes (1 MiB) of stack for every thread created after this call.
stack_size(32768*32)

# Maps an int topic id to the URL that thread_fetch downloads as that topic's
# icon.  Filled in by FetchManger.fetch.
dico = {}

# NOTE(review): the next two patterns are reproduced byte-for-byte from the
# damaged text and are NOT usable as they stand.  The first has two groups
# while its matches are unpacked as (id, pubtime, updtime); the second has one
# group while its matches are indexed [0] and [1].
_INDEX_ENTRY_RE = re.compile(r'(.*?).*?(.*?)', re.DOTALL)
_TOPIC_ICON_RE = re.compile(r'/topics/(\d+).*?', re.DOTALL)
# Captures the page text between the "topic-list" and "pnav" markers.
_TOPIC_LIST_RE = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL)

_FETCH_LOG = "/tmp/simplecfetch.log"


def _topic_icons(page):
    """Return the icon matches found inside the topic-list section of *page*.

    Returns an empty list when the page has no topic-list section.
    """
    sections = _TOPIC_LIST_RE.findall(page)
    if not sections:
        return []
    return _TOPIC_ICON_RE.findall(sections[0])


class FetchManger:
    """Queues [topic id, update time] pairs and processes them in thread_fetch.

    `path` is the directory holding verycd.sqlite3.db, lock.sqlite3.db, the
    `tofetch` list and the `iconcache` tree.

    NOTE(review): `runthread` is called by fetch() but its definition is not
    part of the recovered text.
    """

    def __init__(self, path):
        self.path = path
        self.q = Queue()
        self.running = False

    def ensure_dir(self, f):
        """Create the parent directory of file path *f* if it does not exist."""
        d = os.path.dirname(f)
        # A bare file name has no directory part; os.makedirs('') would raise.
        if d and not os.path.exists(d):
            os.makedirs(d)

    def _icon_path(self, topic):
        """Return the cache file path for *topic*'s icon.

        Layout: <path>/iconcache/<first 2 chars>/<chars 3-4>/<topic>.jpg
        """
        key = str(topic)
        return self.path+'/iconcache/'+key[:2]+'/'+key[2:4]+'/'+key+'.jpg'

    def _enqueue_if_newer(self, cursor, timenow, topic_id, updtime):
        """Queue *topic_id* unless the verycd table already has this update.

        Looks up the stored updtime for the topic; when a row exists and its
        value is >= *updtime* the topic is skipped.  Otherwise the pair is put
        on the work queue and one line is appended to the fetch log.
        """
        cursor.execute("select updtime from verycd where verycdid=?", (topic_id,))
        stored = cursor.fetchone()
        if stored and stored[0] >= updtime:
            print("not updating %s %s" % (topic_id, updtime))
            return
        print("updating %s" % (topic_id,))
        self.q.put([topic_id, updtime])
        if stored:
            entry = "%s,%s,%s>%s\n" % (timenow, topic_id, updtime, stored[0])
        else:
            entry = "%s,%s,%s\n" % (timenow, topic_id, updtime)
        with open(_FETCH_LOG, "a") as log:
            log.write(entry)

    def thread_fetch(self):
        """Worker loop: take (topic, updtime) pairs off the queue and fetch them.

        For each topic this runs fetchvc.fetch, then fcmt.fetchcmt, then
        douban.douban, then tries to cache the topic's icon.  Never returns.
        """
        conn = sqlite3.connect(self.path+'/verycd.sqlite3.db')
        # NOTE(review): database credentials are hard-coded here.
        conn2 = MySQLdb.connect(user='root', passwd='guess8', db='simplecd')
        dbl = sqlite3.connect(self.path+'/lock.sqlite3.db')
        statdb = conn2
        conn.text_factory = str
        conn2.text_factory = str
        dbl.text_factory = str
        #download.httpfetch('http://www.verycd.com',needlogin=False)
        while True:
            topic, updtime = self.q.get()
            try:
                #print 'fetching topic',topic,'...'
                rtn = fetchvc.fetch(topic, conn=conn, dbl=dbl, statdb=statdb,
                                    needlogin=False, updtime=updtime)
                if not rtn:
                    self.q.task_done()
                    print('__cannot access the resource')
                    continue
                print('fetching cmt %s ...' % (topic,))
                fcmt.fetchcmt(topic, conn2, needlogin=False)
                print('fetching douban %s ...' % (topic,))
                douban.douban(topic, conn=conn)
                print('fetching icon %s ...' % (topic,))
                # cache the icon
                cpath = self._icon_path(topic)
                if int(topic) in dico:
                    url = dico[int(topic)]
                    ico = download.httpfetch(url, needlogin=False)
                    self.ensure_dir(cpath)
                    if len(ico) > 0:
                        with open(cpath, 'wb') as out:
                            out.write(ico)
                elif not (os.path.exists(cpath) and os.path.getsize(cpath) > 5):
                    # No usable cached icon (missing, or 5 bytes or fewer):
                    # load the stored page content for this topic.
                    c = conn.cursor()
                    c.execute('select content from verycd where verycdid=?',
                              (int(topic),))
                    content = c.fetchone()[0]
                    c.close()
                    # NOTE(review): the recovered text breaks off here, in the
                    # middle of `imgurl = re.compile(r'`.  Whatever was done
                    # with `content` after this point is lost.
            except Exception as what:
                # NOTE(review): the original handler for this `try` is lost.
                # This one mirrors the handler in fetch() and exists only so
                # that the block is syntactically complete.
                print(what)
            # NOTE(review): the failure branch above calls self.q.task_done(),
            # so the lost tail presumably did the same on the success path;
            # fetch() blocks in self.q.join() until that happens.  Confirm
            # against the original.

    def fetch(self):
        """Queue the topics that need (re)fetching, then wait for the queue.

        NOTE(review): this header is reconstructed from the `manager.fetch()`
        call at the bottom of the file; the original header and the statements
        that bound `idx`, `conn`, `last_updtime`, `timesec` and the first
        `updts`/`timenow` values are lost.  Until they are restored the first
        reference to one of those names raises and is reported by the handler
        below.  The position of the opening `try:` is likewise a guess; only
        its `except` clause survives.
        """
        try:
            # NOTE(review): the assignment target is lost; `ids` is assumed
            # because the very next use sorts and iterates `ids`.
            ids = _INDEX_ENTRY_RE.findall(idx)
            for icon in _topic_icons(updts):
                dico[int(icon[0])] = icon[1]
            # Oldest update first (third field of each entry).
            ids = sorted(set(ids), key=lambda entry: entry[2])
            c = conn.cursor()
            for topic_id, pubtime, updtime in ids:
                if last_updtime < updtime:
                    self._enqueue_if_newer(c, timenow, topic_id, updtime)

            # update1,disabled
            # (a modulo by a positive number is never negative for a
            # non-negative counter, so this branch does not run)
            if timesec % 3888 < 0:
                self.runthread()
                download.httpfetch('http://www.verycd.com', needlogin=False)
                url = 'http://www.verycd.com/sto/page1'
                updts = download.httpfetch(url, needlogin=False)
                for icon in _topic_icons(updts):
                    dico[int(icon[0])] = icon[1]
                    self.q.put([int(icon[0]), None])

            # Always: queue the "id,updtime" lines listed in <path>/tofetch.
            self.runthread()
            download.httpfetch('http://www.verycd.com', needlogin=False)
            # Wall-clock time shifted eight hours ahead of UTC.
            timenow = time.strftime("%Y/%m/%d %H:%M:%S",
                                    time.gmtime(time.time()+3600*8))
            # self.path rather than the script-level `path` global, so this
            # also works when the module is imported instead of executed.
            with open(self.path+"/tofetch") as pending:
                ids = pending.read().split("\n")
            c = conn.cursor()
            for line in set(ids):
                try:
                    raw_id, updtime = line.split(',')
                except ValueError:
                    # Not exactly two comma-separated fields (e.g. blank line).
                    continue
                self._enqueue_if_newer(c, timenow, int(raw_id), updtime)
            #open(path+"/tofetch","w").write("")
        except Exception as what:
            print(what)
        self.q.join()


if __name__ == "__main__":
    path = os.path.dirname(os.path.realpath(sys.argv[0]))
    manager = FetchManger(path=path)
    manager.fetch()