#!/usr/bin/env python
#coding: utf-8
#
# scdd.py daemon process
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23
import os,sys,time
import re
#from daemon import Daemon
import sqlite3,MySQLdb
import fetchvc_noth as fetchvc
import fcmt_noth as fcmt
import douban
import download
import urllib2
from Queue import Queue
from threading import Thread,stack_size,Lock
from random import randint
from feed import feed
import memcache
# Memcached client pointed at the local server; it is not referenced anywhere
# else in the visible part of this file.
mc = memcache.Client(['127.0.0.1:11211'], debug=0)
# Request a 1 MiB (32768*32 bytes) stack for threads created after this call.
stack_size(32768*32)
# Module-wide cache mapping an integer topic id to its icon URL. It is filled
# from the scraped topic list and read by the fetch workers when caching icons.
dico = {}
class FetchManger:
def __init__(self,path):
self.path = path
self.q = Queue()
self.running = False
def ensure_dir(self,f):
    """Create the parent directory of file path *f* if it is missing.

    f -- path of a file that is about to be written; only its directory
         part (os.path.dirname) is created, never the file itself.

    Returns None. Re-raises OSError when the directory cannot be created
    and still does not exist afterwards.
    """
    d = os.path.dirname(f)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if not d or os.path.exists(d):
        return
    try:
        os.makedirs(d)
    except OSError:
        # Several worker threads may try to create the same cache directory
        # at once; losing that race is fine as long as the directory exists.
        if not os.path.isdir(d):
            raise
# Worker body for a fetch thread. It opens its own database connections and
# then loops forever: take a (topic, updtime) item from self.q, fetch the topic
# via fetchvc.fetch, then its comments (fcmt.fetchcmt), its douban data
# (douban.douban) and finally cache its icon under self.path/iconcache.
# NOTE(review): this copy of the method is truncated -- the last statement
# below is cut off inside a string literal, and the rest of the method
# (including the handler belonging to the `try`) is not visible here.
def thread_fetch(self):
# Connections are created inside the worker rather than shared through self.
conn = sqlite3.connect(self.path+'/verycd.sqlite3.db')
# NOTE(review): database credentials are hard-coded in the source.
conn2 = MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
dbl = sqlite3.connect(self.path+'/lock.sqlite3.db')
# statdb is a second name for the MySQL connection, passed to fetchvc.fetch.
statdb = conn2
conn.text_factory = str
# NOTE(review): text_factory is an sqlite3 connection attribute; presumably it
# has no effect on a MySQLdb connection -- confirm.
conn2.text_factory = str
dbl.text_factory = str
#download.httpfetch('http://www.verycd.com',needlogin=False)
while True:
# Blocks until a work item is available on the queue.
topic,updtime = self.q.get()
try:
#print 'fetching topic',topic,'...'
rtn = fetchvc.fetch(topic,conn=conn,dbl=dbl,statdb=statdb,needlogin=False,updtime=updtime)
# A falsy result is treated as "resource not reachable": the item is marked
# done and the remaining steps are skipped for this topic.
if not rtn:
self.q.task_done()
print '__cannot access the resource'
continue
print 'fetching cmt',topic,'...'
fcmt.fetchcmt(topic,conn2,needlogin=False)
print 'fetching douban',topic,'...'
douban.douban(topic,conn=conn)
print 'fetching icon',topic,'...'
# cache the icon
global dico
# Preferred source: the icon URL recorded in dico for this topic id.
if dico.has_key( int(topic) ):
url = dico[ int(topic) ]
ico = download.httpfetch(url,needlogin=False)
# Cache layout: iconcache/<id chars 0-1>/<id chars 2-3>/<id>.jpg
cpath = self.path+'/iconcache/'+str(topic)[:2]+'/'+str(topic)[2:4]+'/'+str(topic)+'.jpg'
self.ensure_dir(cpath)
# Only write the file when the download returned some data.
if len(ico)>0:
open(cpath,'wb').write(ico)
else:
cpath = self.path+'/iconcache/'+str(topic)[:2]+'/'+str(topic)[2:4]+'/'+str(topic)+'.jpg'
# Fallback: when no cached file exists, or it holds 5 bytes or fewer, read the
# stored page content for this topic from the verycd table.
if not (os.path.exists(cpath) and len(open(cpath,'rb').read())>5):
c = conn.cursor()
c.execute( 'select content from verycd where verycdid=?',(int(topic),) )
# NOTE(review): fetchone() returns None when there is no matching row, which
# would make the [0] subscript raise.
content = c.fetchone()[0]
c.close()
# NOTE(review): the source is cut off in the middle of this regex literal.
imgurl = re.compile(r'
# NOTE(review): everything from here down to `self.q.join()` appears to be the
# tail of another method (presumably `fetch`, which the __main__ block calls).
# Its `def` line and opening statements are missing from this copy, so names
# such as idx, updts, ids, conn, last_updtime, timenow and timesec are defined
# outside the visible code. The first line below is the remainder of a cut-off
# regex statement.
(.*?).*?(.*?)',re.DOTALL).findall(idx)
# Keep only the text between the "topic-list" and "pnav" markers.
updts = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(updts)
if updts:
updts = updts[0]
# NOTE(review): the pattern below has lost part of its text; x[1] is used
# afterwards, so it presumably contained a second capture group for the icon URL.
icons = re.compile(r'/topics/(\d+).*?
',re.DOTALL).findall(updts)
# Record topic id -> icon URL in the module-level dico cache.
for x in icons:
dico[ int(x[0]) ] = x[1]
# De-duplicate the entries and order them by their third field (updtime).
ids = sorted(list(set(ids)),cmp=lambda x,y:cmp(x[2],y[2]))
c = conn.cursor()
for id,pubtime,updtime in ids:
if last_updtime < updtime:
c.execute("select updtime from verycd where verycdid=?",(id,))
lupdtime = c.fetchone()
# Skip topics whose stored update time is already at least as new.
if lupdtime and lupdtime[0]>=updtime:
print "not updating",id,updtime
continue
print "updating",id
# Hand the topic to the worker threads.
self.q.put([id,updtime])
# Append an audit line to the log; the stored update time is included when a
# row already existed.
if lupdtime:
open("/tmp/simplecfetch.log","a").write("%s,%s,%s>%s\n"%(timenow,id,updtime,lupdtime[0]))
else:
open("/tmp/simplecfetch.log","a").write("%s,%s,%s\n"%(timenow,id,updtime))
# update1,disabled
# In Python, a modulo by a positive number is never negative, so for a numeric
# timesec this condition is always false and the branch never runs.
if timesec%3888<0:
self.runthread()
download.httpfetch('http://www.verycd.com',needlogin=False)
url = 'http://www.verycd.com/sto/page1'
updts = download.httpfetch(url,needlogin=False)
updts = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(updts)
if updts:
updts = updts[0]
# NOTE(review): same truncated pattern as earlier in this fragment.
icons = re.compile(r'/topics/(\d+).*?
',re.DOTALL).findall(updts)
for x in icons:
dico[ int(x[0]) ] = x[1]
self.q.put([int(x[0]),None])
# Always-on path: queue the topics listed in the "tofetch" file.
if True:
# runthread is not visible in this copy; presumably it starts the workers.
self.runthread()
download.httpfetch('http://www.verycd.com',needlogin=False)
# Timestamp for the log lines, formatted from UTC shifted by +8 hours.
timenow = time.strftime("%Y/%m/%d %H:%M:%S" ,time.gmtime(time.time()+3600*8))
# NOTE(review): this reads the name `path`, not self.path; it appears to rely
# on the module-level `path` assigned in the __main__ block.
ids=open(path+"/tofetch").read().split("\n")
c = conn.cursor()
for x in set(ids):
# Each usable line has the form "id,updtime"; lines that do not split into
# exactly two fields are skipped.
try:
id,updtime = x.split(',')
except:
continue
id = int(id)
c.execute("select updtime from verycd where verycdid=?",(id,))
lupdtime = c.fetchone()
# Skip topics whose stored update time is already at least as new.
if lupdtime and lupdtime[0]>=updtime:
print "not updating",id,updtime
continue
print "updating",id
self.q.put([id,updtime])
if lupdtime:
open("/tmp/simplecfetch.log","a").write("%s,%s,%s>%s\n"%(timenow,id,updtime,lupdtime[0]))
else:
open("/tmp/simplecfetch.log","a").write("%s,%s,%s\n"%(timenow,id,updtime))
#open(path+"/tofetch","w").write("")
# NOTE(review): the `try` that this handler belongs to is not visible here.
except Exception as what:
print what
# Block until every queued item has been marked done with task_done().
self.q.join()
# Script entry point: build a FetchManger rooted at the script's directory and
# run its fetch method.
if __name__ == "__main__":
# Directory containing this script, with symlinks resolved.
# NOTE(review): `path` becomes a module-level name and appears to be read
# directly by the fetch code (open(path+"/tofetch")), so it must keep this name.
path = os.path.dirname(os.path.realpath(sys.argv[0]))
manager = FetchManger(path=path)
manager.fetch()