# -*- coding: utf-8 -*-
# (Encoding declaration repeated here so it stays within the first two
# physical lines now that this header precedes the original first line.)
#
# Purpose: batch script that reads candidate ids from /root/recent, skips the
# ids already present in the MySQL table `verycd`, and hands every remaining
# id to fetch() imported from fetchvc_noth.  The class also carries a parse()
# method that extracts fields from a page with regular expressions and writes
# them to the `lock` table (lock.sqlite3.db) and to the `verycd` table in both
# MySQL and verycd.sqlite3.db.
#
# NOTE(review): formatting damage -- restore from an intact copy before use.
#   * All statements have been collapsed onto six physical lines.  The first
#     one starts with '#', so Python treats it entirely as a comment; the
#     imports, the connections, __init__, login, fetch and the start of parse
#     are inert as written.  The remaining lines are not valid statements.
#   * Indentation is gone, so block nesting (notably the if/else/return chain
#     near the end of parse) cannot be determined from this text.
#   * Several regex literals in parse look truncated, e.g. r']*>(.*?)' and
#     r'requestIcon"[^>]*>\s*]*>(.*?)<', and three raw strings are split across
#     physical lines.  Presumably HTML tags were stripped from the patterns
#     during extraction -- TODO confirm against the original file.
#   * parse tests `title` although no assignment to it is visible, so at least
#     one statement appears to be missing.
#
# NOTE(review): language level.
#   Python 2 syntax throughout (print statements, `except Exception,what`,
#   long()).
#
# NOTE(review): module-level side effects and secrets.
#   * Importing the module opens two sqlite3 databases under
#     /var/www/simplecd.old and a MySQLdb connection to database `simplecd`;
#     `mysqldb` and `statdb` are two names for the same connection.
#   * A site username/password and the MySQL root password are hard-coded as
#     literals.  They should be moved out of the source and rotated.
#
# NOTE(review): class adv_vc_fetcher.
#   * __init__ only sets self.resp = None; the line creating self.br is
#     commented out, yet parse and debug call self.br.get(...) and
#     self.br.get_page_source().  Those paths would raise AttributeError.
#   * login only prints two progress messages; every browser step is
#     commented out.
#   * fetch(diff=3600): `diff` is never used in the visible body.  `allids`
#     is a list, so each `vid in allids` test is a linear scan.  The file
#     /root/recent is opened without being closed.  The slice idss[2263:]
#     skips the first 2263 lines of that file -- presumably a resume offset
#     from an earlier run; verify before reuse.  The first comma-separated
#     field of every line is dropped, and non-integer fields are skipped by a
#     bare `except`.
#   * parse(id, res): `newed2k = ed2k` binds a second name to the same list,
#     so the "delete duplicates" loop mutates ed2k itself.  The loop that
#     commits to the lock database retries forever while commit keeps raising.
#   * The __main__ driver passes sys.argv[1] (a string) or 2400 to fetch,
#     where the value is ignored (see `diff` above).
#coding:utf-8 import selenium.webdriver import re import sqlite3 import MySQLdb import time from fetchvc_noth import clear_idcache,cache_image,fetch #import fetchvc username='wilken_h@163.com' password='weijian23885430' path = '/var/www/simplecd.old' conn = sqlite3.connect(path+'/verycd.sqlite3.db') conn.text_factory = str dbl = sqlite3.connect(path+'/lock.sqlite3.db') dbl.text_factory = str mysqldb = statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8') debug= True class adv_vc_fetcher: def __init__(self): #self.br = selenium.webdriver.firefox.webdriver.WebDriver() #self.br.set_proxies({'http':'127.0.0.1:3128'}) #self.br.set_handle_robots(False) self.resp = None def login(self): print "getting homepage..." #self.br.get("http://secure.verycd.com/signin") #self.br.find_element("name","username").send_keys(username) #self.br.find_element("name","password").send_keys(password) print "submitting login info..." #self.br.find_element("name","login_submit").click() def fetch(self,diff=3600): c = mysqldb.cursor() c.execute('select verycdid from verycd') allids = [ x[0] for x in c.fetchall() ] ididx = '' ididx = open("/root/recent").readlines() idss = [ x.strip().split(',')[1:] for x in ididx ] for ids in idss[2263:]: for vid in ids: try: vid = int(vid) except: continue if vid in allids: print vid,'exists' continue print "getting id",vid #fetchvc.q.put(vid) fetch(vid,conn=conn,dbl=dbl,statdb=statdb,debug=True,cache=False) def parse(self,id,res): abstract = re.compile(r']*>(.*?)',re.DOTALL).findall(abstract) if title: title=title[0] else: return try: status = re.compile(r'requestIcon"[^>]*>\s*]*>(.*?)<',re.DOTALL).search(abstract).group(1) brief = re.compile(r'摘要.*?(.*?)',re.DOTALL).search(abstract).group(1) brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip() pubtime = re.compile(r'时间.*?.*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(abstract)[0] category = 
re.compile(r'top;">分类.*?.*?top;">.*?>(.*?).*?>(.*?)',re.DOTALL).findall(abstract) if not category: category = re.compile(r'top;">分类.*?.*?top;">\s*(.*?)\s+(.*?)\s*',re.DOTALL).findall(abstract) category = list(category[0]) category[0] = category[0].replace(' ','').replace('\xc2\xa0','') category[1] = category[1].replace(' ','').replace('\xc2\xa0','') category[0] = category[0].strip() category[1] = category[1].strip() #ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)[^<]*>([^<]*)',re.DOTALL).findall(res) ed2k = [] ed2k.extend( re.compile(r'ed2k="([^"]*)"[^>]*>([^<]*)',re.DOTALL).findall(res) ) #delete duplicates newed2k = ed2k for i in range(len(ed2k)-1,-1,-1): if ed2k[i] in ed2k[:i]: newed2k.remove(ed2k[i]) content = re.compile(r'(
',re.DOTALL).findall(res) except Exception as what: print 'parse1',what return if content: content = content[0] content = re.compile(r'<(/?OBJECT.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?PARAM.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?EMBED.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(img .*?)>').sub(r'[\1]',content) content = re.compile(r'
',re.DOTALL).sub('\n',content) content = re.compile(r'<.*?>',re.DOTALL).sub('',content) content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content) content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content) content = re.compile(r'\[(img .*?)\]').sub(r'<\1>
',content) content = re.compile(r'\[(/?OBJECT.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?PARAM.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?EMBED.*?)\]').sub(r'<\1>',content) content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content) content = content.strip() imglinks = re.compile(r'src="(http://image-\d*\.app-base\.com/[^"]*)"',re.I).findall(content) try: cache_image(imglinks) except: pass else: content='' vcpv=0 owner = re.compile(r'''"/members/@u\d+/">([^<]+)''',re.M).findall(res) if owner: owner = owner[0] cl=dbl.cursor() try: cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv)) except Exception,what: print what while True: try: dbl.commit() break except Exception,what: print what cl.close() if debug: if vcpv: print vcpv if owner: print owner print title print status print brief print pubtime[0],pubtime[1] print category[0],category[1] for x in ed2k: print x print content ed2kstr = '' for x in ed2k: ed2kstr += '`'.join(x)+'`' if ed2kstr == '': ed2kpage = re.compile(r'"(http://www\.verycd\.com/search/files/.*?rel)"').findall(res) if ed2kpage: url = ed2kpage[-1] self.br.get(url) ed2kpage = self.br.get_page_source() starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage) for start in set(starts): self.br.get(url+"&start=%s"%start) ed2kpage += self.br.get_page_source() ed2kpage = ed2kpage.encode('utf-8') ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage) ed2ks = set(ed2ks) ed2ks = sorted(list(ed2ks)) for ed2k in ed2ks: ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`' print ed2kstr if not ed2kstr: return else: return clear_idcache(id) # mysql c = statdb.cursor() try: c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) statdb.commit() except Exception,what: print 'mysql',what c.close() #sqlite c = conn.cursor() try: 
c.execute("replace into verycd values (?,?,?,?,?,?,?,?,?,?,?)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) conn.commit() except Exception,what: conn.commit() print 'sqlite3',what c.close() def debug(self): print self.br.get_page_source() if __name__=='__main__': f = adv_vc_fetcher() f.login() import os,sys if len(sys.argv)>1: f.fetch(sys.argv[1]) else: f.fetch(2400) #f.parse(5420,open("test").read())