#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetchvc.py fetch resources from verycd
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23

import os
import re
import sqlite3
import sys
import time
import traceback
import urllib
import urllib2
from Queue import Queue
from threading import Thread, Lock, stack_size

import MySQLdb

import download

#path = os.path.dirname(os.path.realpath(sys.argv[0]))
# Deployment directory holding the sqlite databases and the search log.
path = '/var/www/simplecd.old'

# Module-level connections, opened at import time.  Worker threads do not
# use these; thread_fetch() opens its own set.
conn = sqlite3.connect(path+'/verycd.sqlite3.db')
dbl = sqlite3.connect(path+'/lock.sqlite3.db')
# NOTE(review): the MySQL credentials are hard-coded here and again in
# thread_fetch(); consider moving them to configuration.
mysqldb = statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8')
# Return TEXT columns as byte strings instead of unicode objects.
dbl.text_factory = str
conn.text_factory = str

# Work queue of topic ids consumed by the worker threads.
q = Queue()
# Number of worker threads started for the pool.
MAXC = 8
g_mutex = Lock()
# 1 MiB stack for every thread created after this call.
stack_size(32768*32)


def thread_fetch():
    '''Worker loop: take topic ids from ``q`` forever and fetch each one.

    Each worker opens its own sqlite and MySQL connections (sqlite3
    connections may by default only be used from the thread that created
    them) and hands them to fetch().  Never returns.
    '''
    conn = sqlite3.connect(path+'/verycd.sqlite3.db')
    conn.text_factory = str
    dbl = sqlite3.connect(path+'/lock.sqlite3.db')
    dbl.text_factory = str
    statdb = MySQLdb.connect(db='simplecd',user='root',passwd='guess8')
    # Presumably establishes the logged-in session before any topic is
    # requested -- download.httpfetch is not visible here, confirm.
    download.httpfetch('http://www.verycd.com',needlogin=True)
    while True:
        topic = q.get()
        try:
            fetch(topic,conn=conn,dbl=dbl,statdb=statdb)
        except Exception:
            # A failing topic must not kill the worker: report it and go on
            # with the next queue item.
            traceback.print_exc()
        finally:
            # Always balance q.get(); otherwise q.join() waits forever for a
            # topic whose fetch raised.
            q.task_done()


def search(keyword,full=True):
    '''search verycd, fetch search results

    Downloads the folder-search page for ``keyword``, queues every topic id
    found on it and records the keyword and ids in ``search.log``.  When
    ``full`` is true the pagination links found on the page are searched
    too, each with ``full=False`` so the recursion is one level deep.
    '''
    searchlog = path+'/search.log'
    log = open(searchlog,'a')
    try:
        log.write('\n'+keyword+'\n')
    finally:
        log.close()

    url = 'http://www.verycd.com/search/folders/'+keyword
    print('fetching search results ...')
    res = download.httpfetch(url)
    topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
    links = []
    if full:
        links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
        print(links)
    print(topics)

    if topics:
        # One append-mode handle for the whole batch instead of re-opening
        # the log for every topic.
        log = open(searchlog,'a')
        try:
            for topic in topics:
                log.write(topic+',')
                q.put(topic)
        finally:
            log.close()

    if full:
        for key in links:
            search(key,full=False)


def hot():
    ''' read verycd hot res and keep update very day '''
    url = 'http://www.verycd.com/'
    print('fetching homepage ...')
home = download.httpfetch(url) hotzone = re.compile(r'热门资源.*?',re.DOTALL).search(home).group() hot = re.compile(r']*>(《.*?》)[^<]*',re.DOTALL).findall(hotzone) hot = [ x for x in hot ] html = '
(.*)',re.DOTALL).findall(res) owner = re.compile(r''']*>(.*?)''',re.M).findall(res) if owner: owner = owner[0] g_mutex.acquire() cl=dbl.cursor() try: cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv)) except: pass while True: try: dbl.commit() break except: pass cl.close() g_mutex.release() if debug: if vcpv: print vcpv if owner: print owner print title print status print brief print pubtime[0],pubtime[1] print category[0],category[1] for x in ed2k: print x print content ed2kstr = '' for x in ed2k: ed2kstr += '`'.join(x)+'`' if ed2kstr == '': ed2kpage = re.compile(r'href="(http://www\.verycd\.com/search/files/.*?rel)"').findall(res) if ed2kpage: url = ed2kpage[-1] ed2kpage = download.httpfetch(url,report=True,needlogin=needlogin) starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage) for start in set(starts): ed2kpage += download.httpfetch(url+"&start=%s"%start,report=True,needlogin=needlogin) ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage) ed2ks = set(ed2ks) ed2ks = sorted(list(ed2ks)) for ed2k in ed2ks: ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`' else: return tries=0 try: if not dbfind(id,conn): dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn) else: dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn) except: pass clear_idcache(id) #pp = '/var/www/simplecd.old/gz/'+str(id)[0:2]+'/'+str(id)[2:4]+'/' #nn = str(id)+'.gz' #ensure_dir(pp) #import gzip #gzip.open(pp+nn,'wb').write(res) # mysql c = statdb.cursor() c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'') ) statdb.commit() c.close() # mysql # c = db.cursor() # sql = "replace into verycd values (%s,'%s','%s','%s','%s',%s,'%s','%s','%s','%s','%s')" % \ # (str(id), title.replace("'","\\'"), status.replace("'","\\'"), brief.replace("'","\\'"), # 
pubtime[0].replace("'","\\'"), str(int(time.mktime(time.strptime(pubtime[1],'%Y/%m/%d %H:%M:%S')))), # category[0].replace("'","\\'"), category[1].replace("'","\\'"), # ed2kstr.replace("'","\\'"), content.replace("'","\\'"), '' ) # c.execute(sql) # db.commit() # c.close() return pubtime[1]


def dbcreate():
    '''Create the ``verycd`` table in the module-level sqlite database.

    NOTE(review): this schema has 10 columns, while dbinsert() binds 11
    values per row.  Presumably the deployed table carries one more
    trailing column than is declared here -- confirm and bring the two
    into agreement.
    '''
    c = conn.cursor()
    c.execute('''create table verycd(
        verycdid integer primary key,
        title text,
        status text,
        brief text,
        pubtime text,
        updtime text,
        category1 text,
        category2 text,
        ed2k text,
        content text
    )''')
    conn.commit()
    c.close()


def dbinsert(id,title,status,brief,pubtime,category,ed2k,content,conn):
    '''Insert one topic row into ``verycd`` using ``conn``.

    ``pubtime`` and ``category`` are two-item sequences; their items go to
    pubtime/updtime and category1/category2.  The statement is attempted up
    to 10 times, sleeping 5 seconds after each database error.  This is
    best-effort: if every attempt fails nothing is stored and no error is
    reported to the caller.
    '''
    # Built outside the retry loop so a malformed argument fails at once
    # instead of being retried for 50 seconds.
    row = (id,title,status,brief,pubtime[0],pubtime[1],
           category[0],category[1],ed2k,content,'')
    c = conn.cursor()
    try:
        for _ in range(10):
            try:
                c.execute('insert into verycd values(?,?,?,?,?,?,?,?,?,?,?)',row)
                break
            except sqlite3.Error:
                # Only database errors are retried, so Ctrl-C and
                # programming errors are no longer swallowed.
                time.sleep(5)
        conn.commit()
    finally:
        c.close()


def dbupdate(id,title,status,brief,pubtime,category,ed2k,content,conn):
    '''Overwrite the ``verycd`` row whose ``verycdid`` equals ``id``.

    Same argument layout as dbinsert().  The statement is attempted up to
    5 times, sleeping 1 second after each database error; if every attempt
    fails the row is left unchanged and no error is reported.
    '''
    row = (title,status,brief,pubtime[0],pubtime[1],
           category[0],category[1],ed2k,content,id)
    c = conn.cursor()
    try:
        for _ in range(5):
            try:
                c.execute('update verycd set title=?,status=?,brief=?,pubtime=?,'
                          'updtime=?,category1=?,category2=?,ed2k=?,content=? '
                          'where verycdid=?',row)
                break
            except sqlite3.Error:
                time.sleep(1)
        conn.commit()
    finally:
        c.close()


def dbfind(id,conn):
    '''Return True if ``verycd`` already has a row with this ``verycdid``.

    The result is read before the cursor is closed; closing first, as the
    previous version did, meant the row was never seen.
    '''
    c = conn.cursor()
    try:
        c.execute('select 1 from verycd where verycdid=?',(id,))
        return c.fetchone() is not None
    finally:
        c.close()


def dblist():
    '''Print every column value of every ``verycd`` row, one per line.'''
    c = conn.cursor()
    try:
        c.execute('select * from verycd')
        for row in c:
            for value in row:
                print(value)
    finally:
        c.close()


def usage():
    '''Print the command-line help text.'''
    print('''Usage:
    python fetchvc.py createdb
    python fetchvc.py fetchall
    python fetchvc.py fetch 1-1611 #fetch archive list
    python fetchvc.py fetch 5633~5684 #fetch topics
    python fetchvc.py fetch 5633 #fetch a topic
    python fetchvc.py fetch q=keyword
    python fetchvc.py list #list the database
    python fetchvc.py feed #run every 30 min to keep up-to-date
    python fetchvc.py hot
    python fetchvc.py update #update first 20 pages, run on a daily basis''')


#initialize thread pool
# Daemon threads, so they do not keep the process alive once the main
# thread is done.
for i in range(MAXC):
    t = Thread(target=thread_fetch)
    t.setDaemon(True)
    t.start()

if __name__=='__main__':
    argv = sys.argv
    if len(argv) == 1:
        usage()
    elif len(argv) == 2:
        cmd = argv[1]
        if cmd == 'createdb':
            dbcreate()
        elif cmd == 'fetchall':
            fetchall()
        elif cmd.startswith('update'):
            if cmd == 'update':
                update(20)
            else:
                # "updateN" or "updateA-B": the numbers follow the word
                # directly; for a range the bounds are passed swapped.
                ran = cmd[6:].split('-')
                if len(ran) == 2:
                    update(int(ran[1]),int(ran[0]))
                else:
                    update(int(ran[0]))
        elif cmd == 'feed':
            feed()
        elif cmd == 'hot':
            hot()
        elif cmd == 'list':
            dblist()
    elif len(argv) == 3:
        target = argv[2]
        if argv[1] != 'fetch':
            usage()
        elif '~' in target:
            # "A~B": queue every topic id in the inclusive range.
            m = target.split('~')
            for i in range(int(m[0]),int(m[1])+1):
                q.put(i)
        elif target.startswith("q="):
            search(target[2:])
        elif target.startswith("n="):
            normal(target[2:])
        elif target.startswith("r="):
            request(target[2:])
        elif target.startswith("a="):
            all(target[2:])
        elif '-' in target:
            fetchall(target)
        else:
            fetch(int(target),debug=True)
    # wait all threads done
    q.join()