#!/usr/bin/env python # -*- coding: utf-8 -*- # # fetchvc.py fetch resources from verycd # # author: observer # email: jingchaohu@gmail.com # blog: http://obmem.com # last edit @ 2009.12.23 import urllib import re import sqlite3,MySQLdb import time import os,sys from threading import Thread from Queue import Queue import download path = os.path.dirname(os.path.realpath(sys.argv[0])) #conn = sqlite3.connect(path+'/comment.sqlite3.db') #conn.text_factory = str conn = MySQLdb.connect(user='root',passwd='guess8',db='simplecd') q = Queue() MAXC = 8 def thread_fetch(): conn = sqlite3.connect(path+'/comment.sqlite3.db') conn.text_factory = str while True: topic = q.get() fetchcmt(topic,conn) q.task_done() def search(keyword,full=True): '''search verycd, fetch search results''' searchlog = path+'/search.log' open(searchlog,'a').write('\n'+keyword+'\n') url = 'http://www.verycd.com/search/folders/'+keyword print 'fetching search results ...' res = download.httpfetch(url) topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res) topics = set(topics) links = [] if full: links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res) print links print topics if topics: for topic in topics: open(searchlog,'a').write(topic+',') q.put(topic) if full and links: for key in links: search(key,full=False) def hot(): ''' read verycd hot res and keep update very day ''' url = 'http://www.verycd.com/' print 'fetching homepage ...' home = download.httpfetch(url) hotzone = re.compile(r'热门资源.*?',re.DOTALL).search(home).group() hot = re.compile(r']*>(《.*?》)[^<]*',re.DOTALL).findall(hotzone) html = '

每日热门资源

\n' for topic in hot: print 'fetching hot topic',topic[0],'...' q.put(topic[0]) html += ' %s \n' % topic open(path+'/static/hot.html','w').write(html) def normal(pages): '''fetch normal res that need login''' if '-' in pages: (f,t)=[ int(x) for x in pages.split('-') ] else: f = t = int(pages) for page in range(f,t+1): url = 'http://www.verycd.com/orz/page%d?stat=normal' % page idx = download.httpfetch(url,needlogin=True) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx) print ids[0] for id in ids: q.put(id) def request(pages): '''fetch request res that need login''' if '-' in pages: (f,t)=[ int(x) for x in pages.split('-') ] else: f = t = int(pages) for page in range(f,t+1): url = 'http://www.verycd.com/orz/page%d?stat=request' % page idx = download.httpfetch(url,needlogin=True) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(idx) print ids[0] for id in ids: q.put(id) def feed(): ''' read verycd feed and keep update very 30 min ''' url = 'http://www.verycd.com/sto/feed' print 'fetching feed ...' feeds = download.httpfetch(url) ids = re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds) ids = set(ids) print ids now = time.mktime(time.gmtime()) for id in ids: q.put(id) #updtime = fetch(id) #updtime = time.mktime(time.strptime(updtime,'%Y/%m/%d %H:%M:%S'))-8*3600 #gmt+8->gmt #diff = now - updtime #print '%10s secs since update' % (diff) #if diff > 1900: # only need recent 30min updates # break def update(num=10,start=1): urlbase = 'http://www.verycd.com/sto/~all/page' for i in range(start,num+1): print 'fetching list',i,'...' url = urlbase+str(i) res = download.httpfetch(url) res2 = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL).findall(res) if res2: res2 = res2[0] else: continue topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res2) topics = set(topics) print topics for topic in topics: q.put(topic) def fetchall(ran='1-max',debug=False): urlbase = 'http://www.verycd.com/archives/' if ran.endswith('max'): m1 = int(ran.split('-')[0]) res = urllib.urlopen(urlbase).read() m2 = int(re.compile(r'archives/(\d+)').search(res).group(1)) else: m = ran.split('-') m1 = int(m[0]) m2 = int(m[1]) print 'fetching list from',m1,'to',m2,'...' for i in range(m1,m2+1): url = urlbase + '%05d'%i + '.html' print 'fetching from',url,'...' res = download.httpfetch(url) ids = re.compile(r'topics/(\d+)/',re.DOTALL).findall(res) print ids for id in ids: q.put(id) def fetchcmt(id,conn=conn,debug=False,page=1,needlogin=False): print 'fetching topic',id,'...' urlbase = 'http://www.verycd.com/topics/' url = urlbase + str(id) + '/comments/page' + str(page) res = '' for _ in range(3): try: res = download.httpfetch(url,report=True,needlogin=needlogin) break except: continue if page == 1: pages = re.compile(r'/comments/page(\d+)').findall(res) if pages: pages = set(pages) for page in pages: if page != 1: pass #fetchcmt(id=id,conn=conn,page=page,debug=debug) stmts = re.compile(r']*>([^<]*).*?(.*?).*?2009/12/17 02:02:51 stmts = [ [x[0].replace(r'<.*?>',r'').strip(),x[1].replace(r'<.*?>',r'').strip(),x[2].replace(r'<[^>]*>',r'').strip()] for x in stmts] for i in range(len(stmts)): stmts[i][2] = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',stmts[i][2]) stmts[i][2] = re.compile(r']*>',re.I).sub(r'',stmts[i][2]) stmts[i][2] = re.compile(r'',re.I).sub(r'',stmts[i][2]) stmts[i][2] = re.compile(r'',re.I).sub(r'',stmts[i][2]) stmts = [ (int(id),x[0],x[2],int(time.mktime(time.strptime(x[1],'%Y/%m/%d %H:%M:%S')))-8*3600) for x in stmts ] if debug: print len(stmts) for stmt in stmts: print stmt[0],stmt[2],stmt[1] tries = 0 while tries<5: try: c = conn.cursor() c.executemany('replace into comment values (%s,%s,%s,%s)',stmts) break except: tries += 1; time.sleep(5); continue; c.close() conn.commit() return def dbcreate(): c = conn.cursor() c.execute('''create table comment( id integer, username text, comment text, time integer, constraint p3 unique (id,username,time) )''') c.close() conn.commit() def dblist(): c = conn.cursor() c.execute('select * from comment') for x in c: for y in x: print y def usage(): print '''Usage: python fcmt.py createdb python fcmt.py fetchall python fcmt.py fetch 1-1611 #fetch archive list python fcmt.py fetch 5633~5684 #fetch topics python fcmt.py fetch 5633 #fetch a topic python fcmt.py fetch q=keyword python fcmt.py list #list the database python fcmt.py feed #run every 30 min to keep up-to-date python fcmt.py hot python fcmt.py update #update first 20 pages, run on a daily basis''' if __name__=='__main__': if len(sys.argv) == 1: usage() elif len(sys.argv) == 2: if sys.argv[1] == 'createdb': dbcreate() elif sys.argv[1] == 'fetchall': fetchall() elif sys.argv[1].startswith('update'): if sys.argv[1] == 'update': update(20) else: ran = sys.argv[1][6:].split('-') if len(ran) == 2: update(int(ran[1]),int(ran[0])) else: update(int(ran[0])) elif sys.argv[1] == 'feed': feed() elif sys.argv[1] == 'hot': hot() elif sys.argv[1] == 'list': dblist() elif len(sys.argv) == 3: if sys.argv[1] != 'fetch': usage() elif '~' in sys.argv[2]: m = sys.argv[2].split('~') for i in range(int(m[0]),int(m[1])+1): q.put(i) elif sys.argv[2].startswith("q="): search(sys.argv[2][2:]) elif sys.argv[2].startswith("n="): normal(sys.argv[2][2:]) elif sys.argv[2].startswith("r="): request(sys.argv[2][2:]) elif '-' in sys.argv[2]: fetchall(sys.argv[2]) else: fetchcmt(int(sys.argv[2]),debug=True) # wait all threads done q.join()