#!/usr/bin/env python # -*- coding: utf-8 -*- # # fetchvc.py fetch resources from verycd # # author: observer # email: jingchaohu@gmail.com # blog: http://obmem.com # last edit @ 2009.12.23 import urllib,urllib2 import re import sqlite3 import time import os,sys import MySQLdb from threading import Thread,Lock,stack_size from Queue import Queue import download #path = os.path.dirname(os.path.realpath(sys.argv[0])) path = '/var/www/simplecd.old' conn = sqlite3.connect(path+'/verycd.sqlite3.db') dbl = sqlite3.connect(path+'/lock.sqlite3.db') mysqldb = statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8') dbl.text_factory = str conn.text_factory = str q = Queue() MAXC = 8 g_mutex = Lock() stack_size(32768*32) def thread_fetch(): conn = sqlite3.connect(path+'/verycd.sqlite3.db') conn.text_factory = str dbl = sqlite3.connect(path+'/lock.sqlite3.db') dbl.text_factory = str statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8') download.httpfetch('http://www.verycd.com',needlogin=True) while True: topic = q.get() fetch(topic,conn=conn,dbl=dbl,statdb=statdb) q.task_done() def search(keyword,full=True): '''search verycd, fetch search results''' searchlog = path+'/search.log' open(searchlog,'a').write('\n'+keyword+'\n') url = 'http://www.verycd.com/search/folders/'+keyword print 'fetching search results ...' res = download.httpfetch(url) topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res) links = [] if full: links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res) print links print topics if topics: for topic in topics: open(searchlog,'a').write(topic+',') q.put(topic) if full and links: for key in links: search(key,full=False) def hot(): ''' read verycd hot res and keep update very day ''' url = 'http://www.verycd.com/' print 'fetching homepage ...' home = download.httpfetch(url) hotzone = re.compile(r'热门资源.*?',re.DOTALL).search(home).group() hot = re.compile(r']*>(《.*?》)[^<]*',re.DOTALL).findall(hotzone) hot = [ x for x in hot ] html = '

每日热门资源

\n' html += '\n' for topic in hot[:9]: print 'fetching hot topic',topic[0],'...' q.put(topic[0]) html += '\n' % (topic[0],topic[0],topic[1]) html += '
\n' open(path+'/static/hot.html','w').write(html) def normal(pages): '''fetch normal res that need login''' if '-' in pages: (f,t)=[ int(x) for x in pages.split('-') ] else: f = t = int(pages) download.httpfetch('http://www.verycd.com',needlogin=True) for page in range(f,t+1): url = 'http://www.verycd.com/orz/page%d?stat=normal' % page idx = download.httpfetch(url,needlogin=True) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx) print ids[0] for id in ids: q.put(id) def request(pages): '''fetch request res that need login''' if '-' in pages: (f,t)=[ int(x) for x in pages.split('-') ] else: f = t = int(pages) download.httpfetch('http://www.verycd.com',needlogin=True) for page in range(f,t+1): url = 'http://www.verycd.com/orz/page%d?stat=request' % page idx = download.httpfetch(url,needlogin=True) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx) print ids[0] for id in ids: q.put(id) def all(pages): '''fetch request res that need login''' if '-' in pages: (f,t)=[ int(x) for x in pages.split('-') ] else: f = t = int(pages) for page in range(f,t+1): url = 'http://www.verycd.com/orz/page%d?stat=all' % page idx = download.httpfetch(url,needlogin=True) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx) print ids[0] for id in ids: q.put(id) def feed(): ''' read verycd feed and keep update very 30 min ''' url = 'http://www.verycd.com/sto/feed' print 'fetching feed ...' feeds = download.httpfetch(url) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds) ids = set(ids) print ids now = time.mktime(time.gmtime()) for id in ids: q.put(id) #updtime = fetch(id) #updtime = time.mktime(time.strptime(updtime,'%Y/%m/%d %H:%M:%S'))-8*3600 #gmt+8->gmt #diff = now - updtime #print '%10s secs since update' % (diff) #if diff > 1900: # only need recent 30min updates # break def update(num=10,off=1): urlbase = 'http://www.verycd.com/sto/page' for i in range(off,num+1): print 'fetching list',i,'...' url = urlbase+str(i) res = download.httpfetch(url,needlogin=True) res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res) if res2: res2 = res2[0] else: continue topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2) topics = set(topics) print topics for topic in topics: q.put(topic) def fetchall(ran='1-max',debug=False): urlbase = 'http://www.verycd.com/archives/' if ran == '1-max': m1 = 1 res = urllib.urlopen(urlbase).read() m2 = int(re.compile(r'archives/(\d+)').search(res).group(1)) else: m = ran.split('-') m1 = int(m[0]) if m[1]=='max': res = urllib.urlopen(urlbase).read() m2 = int(re.compile(r'archives/(\d+)').search(res).group(1)) else: m2 = int(m[1]) print 'fetching list from',m1,'to',m2,'...' for i in range(m1,m2+1): url = urlbase + '%05d'%i + '.html' print 'fetching from',url,'...' res = download.httpfetch(url) ids = re.compile(r'topics/(\d+)/',re.DOTALL).findall(res) print ids for id in ids: q.put(id) def ensure_dir(f): d = os.path.dirname(f) if not os.path.exists(d): os.makedirs(d) def clear_idcache(id): #clear idcache: try: l1 = str(long(id)%10) l2 = str(long(id)/10%100) l3 = str(long(id)/1000) cachefile = path + '/idcache/%s/%s/%s.html'%(l1,l2,l3) if os.path.exists(cachefile): os.remove(cachefile) except: pass def cache_image(links): if not links: return for l in links: #http://image-7.verycd.com/asdlkfjasdlfjsadkf()/thumb.jpg try: print '___caching',l f = re.compile(r'http://[^/]*').sub(r'',l) f = f.replace('/','') ensure_dir(path+'/imgcache/1') ensure_dir(path+'/imgcache/%s/1'%f[0]) ensure_dir(path+'/imgcache/%s/%s/1'%(f[0],f[1:3])) f = path + '/imgcache/%s/%s/'%(f[0],f[1:3])+f if (not os.path.exists(f)) or (os.path.getsize(f) == 0): open(f,'w').write(download.httpfetch(l)) #open(f,'w').write(urllib.urlopen(l).read()) except: pass def fetch(id,conn=conn,debug=False,dbl=dbl,statdb=statdb,needlogin=False,updtime=None): # db=MySQLdb.connect(user='root',passwd='guess8',db='simplecd') print 'fetching topic',id,'...' urlbase = 'http://www.verycd.com/topics/' url = urlbase + str(id) res = '' for _ in range(3): try: res = download.httpfetch(url,report=True,needlogin=needlogin) break except: continue abstract = re.compile(r']*>(.*?)',re.DOTALL).findall(abstract) if title: title=title[0] else: return try: status = re.compile(r'requestIcon"[^>]*>\s*]*>(.*?)<',re.DOTALL).search(abstract).group(1) brief = re.compile(r'摘要.*?(.*?)',re.DOTALL).search(abstract).group(1) brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip() pubtime = re.compile(r'时间.*?.*?date-time.*?>(.*?).*?date-time.*?>(.*?)',re.DOTALL).findall(abstract)[0] if updtime: pubtime = list(pubtime) pubtime[1] = updtime category = re.compile(r'align:top;">分类.*?.*?>.*?>(.*?).*?>(.*?)',re.DOTALL).findall(abstract) if not category: category = re.compile(r'align:top;">分类.*?.*?>\s*(.*?)\s+(.*?)\s*',re.DOTALL).findall(abstract) category = list(category[0]) category[0] = category[0].replace(' ','') category[1] = category[1].replace(' ','') ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)[^<]*>([^<]*)',re.DOTALL).findall(res) ed2k.extend( re.compile(r'ed2k="([^"]*)"[^>]*>([^<]*)',re.DOTALL).findall(res) ) #delete duplicates newed2k = ed2k for i in range(len(ed2k)-1,-1,-1): if ed2k[i] in ed2k[:i]: newed2k.remove(ed2k[i]) content = re.compile(r'iptcomContents">(.*?)',re.DOTALL).findall(res) except Exception as what: print what return if content: content = content[0] imglinks = re.compile(r'src="(http://image-\d*\.verycd\.com/[^"]*)"',re.I).findall(content) try: cache_image(imglinks) except: pass content = re.compile(r'<(/?OBJECT.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?PARAM.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(/?EMBED.*?)>',re.DOTALL).sub(r'[\1]',content) content = re.compile(r'<(img .*?)>').sub(r'[\1]',content) content = re.compile(r'
',re.DOTALL).sub('\n',content) content = re.compile(r'<.*?>',re.DOTALL).sub('',content) content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content) content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content) content = re.compile(r'\[(img .*?)\]').sub(r'<\1>
',content) content = re.compile(r'\[(/?OBJECT.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?PARAM.*?)\]').sub(r'<\1>',content) content = re.compile(r'\[(/?EMBED.*?)\]').sub(r'<\1>',content) content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content) content = content.strip() else: content='' vcpv = 0 #fetch stat try: try: staturl = 'http://stat.verycd.com/counters/folder/'+str(id)+'/' st = urllib2.urlopen(staturl).read() #st = download.httpfetch(staturl) vcpv = int(re.compile(r'\'(\d+)\'').findall(st)[0]) except Exception as what: print what vcpv = 0 c2 = statdb.cursor() c2.execute('select * from t1 where id=%s',(id,)) r = [x for x in c2.fetchall()] if r == []: # empty, insert c2.execute('insert into t1 (id,comments,hits,score,title,brief,category1,updtime,status,vcpv) values \ (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (id,0,0,0,title,brief,category[0],pubtime[1],status,vcpv) ) else: # update c2.execute('update t1 set vcpv=%s,status=%s where id=%s',(vcpv,status,id)) except Exception as what: print what pass c2.close() statdb.commit() # update lock # owner = re.compile(r'
.*?

(.*)',re.DOTALL).findall(res) owner = re.compile(r''']*>(.*?)''',re.M).findall(res) if owner: owner = owner[0] g_mutex.acquire() cl=dbl.cursor() try: cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv)) except: pass while True: try: dbl.commit() break except: pass cl.close() g_mutex.release() if debug: if vcpv: print vcpv if owner: print owner print title print status print brief print pubtime[0],pubtime[1] print category[0],category[1] for x in ed2k: print x print content ed2kstr = '' for x in ed2k: ed2kstr += '`'.join(x)+'`' if ed2kstr == '': ed2kpage = re.compile(r'href="(http://www\.verycd\.com/search/files/.*?rel)"').findall(res) if ed2kpage: url = ed2kpage[-1] ed2kpage = download.httpfetch(url,report=True,needlogin=needlogin) starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage) for start in set(starts): ed2kpage += download.httpfetch(url+"&start=%s"%start,report=True,needlogin=needlogin) ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage) ed2ks = set(ed2ks) ed2ks = sorted(list(ed2ks)) for ed2k in ed2ks: ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`' else: return tries=0 try: if not dbfind(id,conn): dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn) else: dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn) except: pass clear_idcache(id) #pp = '/var/www/simplecd.old/gz/'+str(id)[0:2]+'/'+str(id)[2:4]+'/' #nn = str(id)+'.gz' #ensure_dir(pp) #import gzip #gzip.open(pp+nn,'wb').write(res) # mysql c = statdb.cursor() c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) statdb.commit() c.close() # mysql # c = db.cursor() # sql = "replace into verycd values (%s,'%s','%s','%s','%s',%s,'%s','%s','%s','%s','%s')" % \ # (str(id), title.replace("'","\\'"), status.replace("'","\\'"), brief.replace("'","\\'"), # pubtime[0].replace("'","\\'"), str(int(time.mktime(time.strptime(pubtime[1],'%Y/%m/%d %H:%M:%S')))), # category[0].replace("'","\\'"), category[1].replace("'","\\'"), # ed2kstr.replace("'","\\'"), content.replace("'","\\'"), '' ) # c.execute(sql) # db.commit() # c.close() return pubtime[1] def dbcreate(): c = conn.cursor() c.execute('''create table verycd( verycdid integer primary key, title text, status text, brief text, pubtime text, updtime text, category1 text, category2 text, ed2k text, content text )''') conn.commit() c.close() def dbinsert(id,title,status,brief,pubtime,category,ed2k,content,conn): c = conn.cursor() tries = 0 while tries<10: try: c.execute('insert into verycd values(?,?,?,?,?,?,?,?,?,?,?)',\ (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],\ ed2k,content,'')) break except: tries += 1 time.sleep(5) continue conn.commit() c.close() def dbupdate(id,title,status,brief,pubtime,category,ed2k,content,conn): tries = 0 c = conn.cursor() while tries<5: try: c.execute('update verycd set title=?,status=?,brief=?,pubtime=?,\ updtime=?,category1=?,category2=?,ed2k=?,content=? where verycdid=?',\ (title,status,brief,pubtime[0],pubtime[1],category[0],category[1],\ ed2k,content,id)) break except: tries += 1 time.sleep(1) continue conn.commit() c.close() def dbfind(id,conn): c = conn.cursor() c.execute('select 1 from verycd where verycdid=?',(id,)) c.close() for x in c: if 1 in x: return True else: return False def dblist(): c = conn.cursor() c.execute('select * from verycd') for x in c: for y in x: print y def usage(): print '''Usage: python fetchvc.py createdb python fetchvc.py fetchall python fetchvc.py fetch 1-1611 #fetch archive list python fetchvc.py fetch 5633~5684 #fetch topics python fetchvc.py fetch 5633 #fetch a topic python fetchvc.py fetch q=keyword python fetchvc.py list #list the database python fetchvc.py feed #run every 30 min to keep up-to-date python fetchvc.py hot python fetchvc.py update #update first 20 pages, run on a daily basis''' #initialize thread pool for i in range(MAXC): t = Thread(target=thread_fetch) t.setDaemon(True) t.start() if __name__=='__main__': if len(sys.argv) == 1: usage() elif len(sys.argv) == 2: if sys.argv[1] == 'createdb': dbcreate() elif sys.argv[1] == 'fetchall': fetchall() elif sys.argv[1].startswith('update'): if sys.argv[1] == 'update': update(20) else: ran = sys.argv[1][6:].split('-') if len(ran) == 2: update(int(ran[1]),int(ran[0])) else: update(int(ran[0])) elif sys.argv[1] == 'feed': feed() elif sys.argv[1] == 'hot': hot() elif sys.argv[1] == 'list': dblist() elif len(sys.argv) == 3: if sys.argv[1] != 'fetch': usage() elif '~' in sys.argv[2]: m = sys.argv[2].split('~') for i in range(int(m[0]),int(m[1])+1): q.put(i) elif sys.argv[2].startswith("q="): search(sys.argv[2][2:]) elif sys.argv[2].startswith("n="): normal(sys.argv[2][2:]) elif sys.argv[2].startswith("r="): request(sys.argv[2][2:]) elif sys.argv[2].startswith("a="): all(sys.argv[2][2:]) elif '-' in sys.argv[2]: fetchall(sys.argv[2]) else: fetch(int(sys.argv[2]),debug=True) # wait all threads done q.join()