#coding:utf-8 import selenium.webdriver from selenium.webdriver import Firefox,Chrome,Remote,DesiredCapabilities import urllib2 import re import sqlite3 import MySQLdb import time from fetchvc_noth import clear_idcache,cache_image,fetch from download import httpfetch username='wilken_h@163.com' password='weijian23885430' path = '/var/www/simplecd.old' conn = sqlite3.connect(path+'/verycd.sqlite3.db') conn.text_factory = str dbl = sqlite3.connect(path+'/lock.sqlite3.db') dbl.text_factory = str mysqldb = statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8') debug= True class adv_vc_fetcher: def __init__(self): self.br = Firefox() #self.br = Remote(desired_capabilities=DesiredCapabilities.HTMLUNITWITHJS) self.resp = None def timenow(self): return '['+time.strftime("%Y/%m/%d %H:%M:%S" ,time.gmtime(time.time()+3600*8))+']' def login(self): print self.timenow(),"getting homepage..." self.br.get("http://secure.verycd.com/signin") self.br.find_element("name","username").send_keys(username) self.br.find_element("name","password").send_keys(password) print self.timenow(),"submitting login info..." self.br.find_element("name","login_submit").click() def fetch(self,diff=3600): ididx = '' ididx2 = '' ididx3 = '' for i in range(1,int(diff)/3600/25+2): print self.timenow(),"fetching elite indexes..",i self.br.get("http://www.verycd.com/orz/page%d?stat=elite&orderby=edit"%i) ididx3 += self.br.get_page_source() ids3 = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx3) ids3 = sorted( list(set(ids3)), cmp=lambda x,y:cmp(x[2],y[2]) ) self.save(ids3,diff) print self.timenow(),"fetching req indexes..",i self.br.get("http://www.verycd.com/orz/page%d?stat=requested&orderby=edit"%i) ididx += self.br.get_page_source() ids = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx) ids = sorted( list(set(ids)), cmp=lambda x,y:cmp(x[2],y[2]) ) self.save(ids,diff) print self.timenow(),"fetching normal indexes..",i self.br.get("http://www.verycd.com/orz/page%d?stat=normal&orderby=edit"%i) ididx2 += self.br.get_page_source() ids2 = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx2) ids2 = sorted( list(set(ids2)), cmp=lambda x,y:cmp(x[2],y[2]) ) self.save(ids2,diff) ids = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx) ids2 = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx2) ids3 = re.compile('topics/(\d+).*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(ididx3) ids = sorted( list(set(ids)), cmp=lambda x,y:cmp(x[2],y[2]) ) ids2 = sorted( list(set(ids2)), cmp=lambda x,y:cmp(x[2],y[2]) ) ids3 = sorted( list(set(ids3)), cmp=lambda x,y:cmp(x[2],y[2]) ) self.save(ids3,diff) self.save(ids2,diff) self.save(ids,diff) #self.judge(ids3,diff) #self.judge(ids,diff) #self.judge(ids2,diff) def save(self,ids,diff=3600): for vid,t1,t2 in ids: vid = int(vid) time_2hrs = time.strftime("%Y/%m/%d %H:%M:%S" ,time.gmtime(time.time()+3600*8-int(diff))) if t2=t2: continue open(path+"/tofetch","a").write(str(vid)+","+str(t2)+"\n") def judge(self,ids,diff=3600): for vid,t1,t2 in ids: vid = int(vid) time_2hrs = time.strftime("%Y/%m/%d %H:%M:%S" ,time.gmtime(time.time()+3600*8-int(diff))) if t2=t2: print self.timenow(),vid,"updated, skipping" continue print self.timenow(),"getting",vid,t2 #content = urllib2.urlopen("http://www.verycd.com/topics/%d/"%vid).read() #content = httpfetch("http://www.verycd.com/topics/%d/"%vid) #self.br.get("http://www.verycd.com/topics/%d/"%vid) #if u"\u6709\u9519\u8bef\u53d1\u751f" in self.br.title: #print self.timenow(),"cannot access" #print self.timenow(),"exiting process" #break #continue #content = self.br.get_page_source() #rtn = self.parse(vid,content.encode('utf-8'),t2) rtn = fetch(vid,updtime=t2) if not rtn: break def parse(self,id,res,updtime=None): abstract = re.compile(r']*>(.*?)',re.DOTALL).findall(abstract) if title: title=title[0] else: return try: status = re.compile(r'requestIcon"[^>]*>\s*]*>(.*?)<',re.DOTALL).search(abstract).group(1) brief = re.compile(r'摘要.*?(.*?)',re.DOTALL).search(abstract).group(1) brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip() pubtime = re.compile(r'时间.*?.*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(abstract)[0] if updtime: pubtime = list(pubtime) pubtime[1] = updtime category = re.compile(r'top;">分类.*?.*?top;">.*?>(.*?).*?>(.*?)',re.DOTALL).findall(abstract) if not category: category = re.compile(r'top;">分类.*?.*?top;">\s*(.*?)\s+(.*?)\s*',re.DOTALL).findall(abstract) category = list(category[0]) category[0] = category[0].replace(' ','').replace('\xc2\xa0','') category[1] = category[1].replace(' ','').replace('\xc2\xa0','') category[0] = category[0].strip() category[1] = category[1].strip() #ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)[^<]*>([^<]*)',re.DOTALL).findall(res) ed2k = [] ed2k.extend( re.compile(r'ed2k="([^"]*)"[^>]*>([^<]*)',re.DOTALL).findall(res) ) #delete duplicates newed2k = ed2k for i in range(len(ed2k)-1,-1,-1): if ed2k[i] in ed2k[:i]: newed2k.remove(ed2k[i]) content = re.compile(r'(
',re.DOTALL).findall(res) except Exception as what: print self.timenow(),'parse1',what return if content: content = content[0] content = re.compile(r'<(/?OBJECT.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?PARAM.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?EMBED.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(img .*?)>').sub(r'[\1]',content) content = re.compile(r'
',re.DOTALL).sub('\n',content) content = re.compile(r'<.*?>',re.DOTALL).sub('',content) content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content) content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content) content = re.compile(r'\[(img .*?)\]').sub(r'<\1>
',content) content = re.compile(r'\[(/?OBJECT.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?PARAM.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?EMBED.*?)\]').sub(r'<\1>',content) content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content) content = content.strip() imglinks = re.compile(r'src="(http://image-\d*\.app-base\.com/[^"]*)"',re.I).findall(content) try: cache_image(imglinks) except: pass else: content='' vcpv=0 try: staturl = 'http://stat.verycd.com/counters/folder/'+str(id)+'/' st = urllib2.urlopen(staturl).read() vcpv = int(re.compile(r'\'(\d+)\'').findall(st)[0]) except Exception as what: print self.timenow(),what vcpv = 0 try: c2 = statdb.cursor() c2.execute('select * from t1 where id=%s',(id,)) r = [x for x in c2.fetchall()] if r == []: # empty, insert c2.execute('insert into t1 (id,comments,hits,score,title,brief,category1,updtime,status,vcpv) values \ (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (id,0,0,0,title,brief,category[0],pubtime[1],status,vcpv) ) else: # update c2.execute('update t1 set vcpv=%s,status=%s where id=%s',(vcpv,status,id)) except Exception as what: print self.timenow(),what owner = re.compile(r'''"/members/@u\d+/">([^<]+)''',re.M).findall(res) if owner: owner = owner[0] cl=dbl.cursor() try: cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv)) except Exception,what: print self.timenow(),what while True: try: dbl.commit() break except Exception,what: print self.timenow(),what cl.close() print self.timenow(),title,status if debug == "impossible": if vcpv: print vcpv if owner: print owner print title print status print brief print pubtime[0],pubtime[1] print category[0],category[1] for x in ed2k: print x print content ed2kstr = '' for x in ed2k: ed2kstr += '`'.join(x)+'`' if ed2kstr == '': ed2kpage = re.compile(r'"(http://www\.verycd\.com/search/files/.*?rel)"').findall(res) if ed2kpage: url = ed2kpage[-1] self.br.get(url) ed2kpage = self.br.get_page_source() starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage) for start in set(starts): self.br.get(url+"&start=%s"%start) ed2kpage += self.br.get_page_source() ed2kpage = ed2kpage.encode('utf-8') ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage) ed2ks = set(ed2ks) ed2ks = sorted(list(ed2ks)) for ed2k in ed2ks: ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`' #print ed2kstr if not ed2kstr: return else: return clear_idcache(id) # mysql c = statdb.cursor() try: c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) statdb.commit() except Exception,what: print 'mysql',what c.close() #sqlite c = conn.cursor() try: c.execute("replace into verycd values (?,?,?,?,?,?,?,?,?,?,?)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) conn.commit() except Exception,what: try: conn.commit() except: pass print 'sqlite3',what c.close() return True def debug(self): print self.br.get_page_source() def __del__(self): self.br.quit() if __name__=='__main__': f = adv_vc_fetcher() f.login() import os,sys if len(sys.argv)>1: f.fetch(sys.argv[1]) else: f.fetch(10800) #f.parse(5420,open("test").read())