#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetchvc.py fetch resources from verycd
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23
import urllib
import re
import sqlite3,MySQLdb
import time
import os,sys
from threading import Thread
from Queue import Queue
import download
# Directory that contains the running script; the search log, the sqlite file
# and static/hot.html are all addressed relative to it.
path = os.path.dirname(os.path.realpath(sys.argv[0]))
# Earlier SQLite backend, left disabled:
#conn = sqlite3.connect(path+'/comment.sqlite3.db')
#conn.text_factory = str
# Module-level MySQL connection: the default `conn` of fetchcmt() and the
# connection used by dbcreate() and dblist().
# NOTE(review): the credentials are hard-coded -- consider reading them from
# configuration or the environment instead.
conn = MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
# Work queue of topic ids; filled by the listing functions, drained by thread_fetch().
q = Queue()
# NOTE(review): presumably the number of worker threads to run -- confirm.
MAXC = 8
def thread_fetch():
    '''Worker loop: take topic ids off the queue and store their comments.

    Runs forever; intended to be the target of a daemon thread. Every item
    taken from ``q`` is acknowledged with ``q.task_done()`` even when the
    fetch fails, so that ``q.join()`` in the main thread cannot hang on a
    failed topic.
    '''
    # One private connection per worker: a DB-API connection should not be
    # shared between threads. It is a MySQL connection (same settings as the
    # module-level ``conn``) because fetchcmt() issues its insert with
    # ``%s`` placeholders, which a sqlite3 connection rejects.
    conn = MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
    while True:
        topic = q.get()
        try:
            fetchcmt(topic,conn)
        except Exception:
            # Report and carry on: one bad topic must not kill the worker.
            print('fetching topic %s failed: %s' % (topic, sys.exc_info()[1]))
        finally:
            q.task_done()
def _append_to_log(logfile, text):
    '''Append *text* to *logfile*, closing the handle even if the write fails.'''
    handle = open(logfile,'a')
    try:
        handle.write(text)
    finally:
        handle.close()

def search(keyword,full=True):
    '''Search verycd folders for *keyword* and queue every topic id found.

    keyword -- search term; it is appended to the search URL as-is.
    full    -- when true, the further result pages linked from this page
               (``...?start=N``) are searched as well, each with
               ``full=False`` so the recursion stops after one level.

    The keyword and the topic ids found are appended to ``search.log`` in
    the script directory.
    '''
    searchlog = path+'/search.log'
    # Logged before the request, so the keyword is recorded even if the fetch fails.
    _append_to_log(searchlog,'\n'+keyword+'\n')
    url = 'http://www.verycd.com/search/folders/'+keyword
    print('fetching search results ...')
    res = download.httpfetch(url)
    topics = set(re.compile(r'/topics/(\d+)',re.DOTALL).findall(res))
    links = []
    if full:
        links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
    print(links)
    print(topics)
    if topics:
        # One write for the whole batch instead of reopening the file per topic.
        _append_to_log(searchlog,''.join([topic+',' for topic in topics]))
        for topic in topics:
            q.put(topic)
    for key in links:
        search(key,full=False)
def hot():
''' read verycd hot res and keep update very day '''
# Fetches the verycd home page, cuts out the hot-resources section, queues
# each hot topic and writes an HTML list of them to <path>/static/hot.html.
# The docstring indicates it is meant to be run every day.
# NOTE(review): this copy of the function has lost its indentation, and the
# HTML markup inside the regex/template literals below appears to have been
# stripped (the `html = '` literal is left unterminated across three lines).
# Restore the literals from the upstream source before relying on this code.
url = 'http://www.verycd.com/'
print 'fetching homepage ...'
home = download.httpfetch(url)
# .search(...).group() raises AttributeError when the section is not found.
hotzone = re.compile(r'热门资源.*?',re.DOTALL).search(home).group()
hot = re.compile(r']*>(《.*?》)[^<]*',re.DOTALL).findall(hotzone)
html = '
每日热门资源
\n'
for topic in hot:
# topic[0] is what gets queued -- presumably the topic id captured by the
# original (un-garbled) pattern; confirm against upstream.
print 'fetching hot topic',topic[0],'...'
q.put(topic[0])
html += ' %s \n' % topic
# Overwrites the previous list on every run.
open(path+'/static/hot.html','w').write(html)
def normal(pages):
    '''Queue the topic ids listed on the login-only ``stat=normal`` pages.

    pages -- either a single page number (``'3'``) or an inclusive range
             (``'3-7'``), given as a string.

    Raises ValueError when *pages* is not of one of those two forms.
    '''
    if '-' in pages:
        (f,t)=[ int(x) for x in pages.split('-') ]
    else:
        f = t = int(pages)
    topic_re = re.compile(r'/topics/(\d+)',re.DOTALL)
    for page in range(f,t+1):
        url = 'http://www.verycd.com/orz/page%d?stat=normal' % page
        idx = download.httpfetch(url,needlogin=True)
        ids = topic_re.findall(idx)
        if not ids:
            # A page without topic links used to crash on ids[0]; skip it instead.
            print('no topics found on page %d' % page)
            continue
        print(ids[0])
        for topic_id in ids:
            q.put(topic_id)
def request(pages):
    '''Queue the topic ids listed on the login-only ``stat=request`` pages.

    pages -- either a single page number (``'3'``) or an inclusive range
             (``'3-7'``), given as a string.

    Raises ValueError when *pages* is not of one of those two forms.
    '''
    if '-' in pages:
        (f,t)=[ int(x) for x in pages.split('-') ]
    else:
        f = t = int(pages)
    topic_re = re.compile(r'/topics/(\d+)',re.DOTALL)
    for page in range(f,t+1):
        url = 'http://www.verycd.com/orz/page%d?stat=request' % page
        idx = download.httpfetch(url,needlogin=True)
        ids = topic_re.findall(idx)
        if not ids:
            # A page without topic links used to crash on ids[0]; skip it instead.
            print('no topics found on page %d' % page)
            continue
        print(ids[0])
        for topic_id in ids:
            q.put(topic_id)
def feed():
    '''Queue every topic id that appears in the verycd feed.

    Meant to be run about every 30 minutes to keep the database up to date.
    '''
    url = 'http://www.verycd.com/sto/feed'
    print('fetching feed ...')
    feeds = download.httpfetch(url)
    ids = set(re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds))
    print(ids)
    # NOTE: an earlier version stopped once a topic's last update was more
    # than about 30 minutes old; that filter is disabled, so every topic in
    # the feed is queued.
    for topic_id in ids:
        q.put(topic_id)
def update(num=10,start=1):
    '''Queue the topic ids found on listing pages *start* through *num*.

    Each page of the ``/sto/~all/`` listing is fetched in turn; only the
    part between the "topic-list" and "pnav" markers is scanned for topic
    links. Pages where those markers are missing are skipped.
    '''
    listing_re = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL)
    topic_re = re.compile(r'/topics/(\d+)',re.DOTALL)
    for page_no in range(start,num+1):
        print('%s %s %s' % ('fetching list', page_no, '...'))
        body = download.httpfetch('http://www.verycd.com/sto/~all/page'+str(page_no))
        listing = listing_re.search(body)
        if listing is None:
            continue
        found = set(topic_re.findall(listing.group(1)))
        print(found)
        for topic in found:
            q.put(topic)
def fetchall(ran='1-max',debug=False):
    '''Queue every topic id listed on a range of verycd archive pages.

    ran   -- ``'<first>-<last>'`` (inclusive), or ``'<first>-max'`` to use
             the first archive number that appears on the archive index
             page as the upper bound.
    debug -- accepted for interface compatibility; not used.

    Raises ValueError when the index page contains no archive link or when
    *ran* is malformed.
    '''
    urlbase = 'http://www.verycd.com/archives/'
    if ran.endswith('max'):
        m1 = int(ran.split('-')[0])
        index = urllib.urlopen(urlbase)
        try:
            res = index.read()
        finally:
            # Always release the connection, even if the read fails.
            index.close()
        newest = re.compile(r'archives/(\d+)').search(res)
        if newest is None:
            raise ValueError('no archive link found at '+urlbase)
        m2 = int(newest.group(1))
    else:
        m = ran.split('-')
        m1 = int(m[0])
        m2 = int(m[1])
    print('fetching list from %s to %s ...' % (m1, m2))
    topic_re = re.compile(r'topics/(\d+)/',re.DOTALL)
    for i in range(m1,m2+1):
        url = urlbase + '%05d'%i + '.html'
        print('fetching from %s ...' % url)
        res = download.httpfetch(url)
        ids = topic_re.findall(res)
        print(ids)
        for topic_id in ids:
            q.put(topic_id)
# Fetch one page of comments for topic `id` from verycd, parse them and store
# them in the `comment` table through `conn`.
#   id        -- topic id (int or numeric string; converted with int() below)
#   conn      -- DB connection; the default is the module-level connection,
#                bound when this def statement is executed
#   debug     -- when true, print the number of parsed comments and each one
#   page      -- comment page number to fetch
#   needlogin -- passed through to download.httpfetch
# NOTE(review): this copy of the function has lost its indentation, and the
# HTML markup inside several regex literals appears to have been stripped
# (the first `stmts = re.compile(...` line is cut off mid-literal). Restore
# those literals from the upstream source; the code is left untouched here.
def fetchcmt(id,conn=conn,debug=False,page=1,needlogin=False):
print 'fetching topic',id,'...'
urlbase = 'http://www.verycd.com/topics/'
url = urlbase + str(id) + '/comments/page' + str(page)
res = ''
# Up to three download attempts; any exception is swallowed, and `res` stays
# '' if all three fail.
for _ in range(3):
try:
res = download.httpfetch(url,report=True,needlogin=needlogin)
break
except:
continue
# On the first page, collect the other comment page numbers. The loop below
# is a no-op: the recursive call is commented out. The page numbers are
# strings, so `page != 1` is always true, and `page` is rebound by the loop.
if page == 1:
pages = re.compile(r'/comments/page(\d+)').findall(res)
if pages:
pages = set(pages)
for page in pages:
if page != 1:
pass
#fetchcmt(id=id,conn=conn,page=page,debug=debug)
# NOTE(review): garbled literal -- presumably this extracted
# (username, timestamp, comment body) triples; confirm against upstream.
stmts = re.compile(r']*>([^<]*).*?(.*?).*?2009/12/17 02:02:51
# NOTE(review): str.replace() treats its first argument literally, so these
# regex-looking patterns only remove text that matches them character for character.
stmts = [ [x[0].replace(r'<.*?>',r'').strip(),x[1].replace(r'<.*?>',r'').strip(),x[2].replace(r'<[^>]*>',r'').strip()] for x in stmts]
for i in range(len(stmts)):
# Rewrite image hosts from *.verycd.com to *.app-base.com in the comment body.
stmts[i][2] = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',stmts[i][2])
stmts[i][2] = re.compile(r']*>',re.I).sub(r'',stmts[i][2])
stmts[i][2] = re.compile(r'
',re.I).sub(r'',stmts[i][2])
stmts[i][2] = re.compile(r'',re.I).sub(r'',stmts[i][2])
# Build rows of (topic id, username, comment, unix time). The parsed time is
# shifted by -8 hours -- presumably GMT+8 to GMT; confirm.
stmts = [ (int(id),x[0],x[2],int(time.mktime(time.strptime(x[1],'%Y/%m/%d %H:%M:%S')))-8*3600) for x in stmts ]
if debug:
print len(stmts)
for stmt in stmts:
print stmt[0],stmt[2],stmt[1]
# Up to five insert attempts, sleeping 5 seconds after each failure; every
# exception is swallowed.
# NOTE(review): if conn.cursor() fails on all five tries, `c` is unbound when
# c.close() runs below. The `%s` placeholders are the MySQLdb parameter
# style; a sqlite3 connection would reject them -- verify which connection
# callers pass in.
tries = 0
while tries<5:
try:
c = conn.cursor()
c.executemany('replace into comment values (%s,%s,%s,%s)',stmts)
break
except:
tries += 1;
time.sleep(5);
continue;
c.close()
conn.commit()
return
def dbcreate():
    '''Create the ``comment`` table on the module-level connection and commit.

    A row holds a topic id, a username, the comment text and an integer
    time; (id, username, time) is declared unique.
    '''
    schema = '''create table comment(
id integer,
username text,
comment text,
time integer,
constraint p3 unique (id,username,time)
)'''
    cursor = conn.cursor()
    cursor.execute(schema)
    cursor.close()
    conn.commit()
def dblist():
    '''Print every field of every row of the ``comment`` table, one per line.'''
    c = conn.cursor()
    try:
        c.execute('select * from comment')
        for row in c:
            for field in row:
                print(field)
    finally:
        # Release the cursor even if the query or the printing fails.
        c.close()
def usage():
    '''Print the command-line help text to stdout.'''
    help_text = '''Usage:
python fcmt.py createdb
python fcmt.py fetchall
python fcmt.py fetch 1-1611 #fetch archive list
python fcmt.py fetch 5633~5684 #fetch topics
python fcmt.py fetch 5633 #fetch a topic
python fcmt.py fetch q=keyword
python fcmt.py list #list the database
python fcmt.py feed #run every 30 min to keep up-to-date
python fcmt.py hot
python fcmt.py update #update first 20 pages, run on a daily basis'''
    print(help_text)
def _start_workers():
    '''Start MAXC daemon threads that drain the topic queue via thread_fetch().'''
    for _ in range(MAXC):
        worker = Thread(target=thread_fetch)
        # Daemon threads: the workers loop forever, so they must not keep the
        # process alive once the main thread has finished.
        worker.setDaemon(True)
        worker.start()

# Command-line entry point: dispatch on the arguments, then wait until every
# queued topic has been processed.
if __name__=='__main__':
    args = sys.argv[1:]
    command = args[0] if args else None

    # Without running workers nothing ever consumes the queue and q.join()
    # below would block forever. Commands that only print help or only touch
    # the database directly do not need them.
    if command not in (None, 'createdb', 'list'):
        _start_workers()

    if len(args) == 1:
        if command == 'createdb':
            dbcreate()
        elif command == 'fetchall':
            fetchall()
        elif command.startswith('update'):
            if command == 'update':
                update(20)
            else:
                # 'update<N>' -> pages 1..N, 'update<A>-<B>' -> pages A..B
                ran = command[6:].split('-')
                if len(ran) == 2:
                    update(int(ran[1]),int(ran[0]))
                else:
                    update(int(ran[0]))
        elif command == 'feed':
            feed()
        elif command == 'hot':
            hot()
        elif command == 'list':
            dblist()
        else:
            # Unknown commands used to be ignored silently.
            usage()
    elif len(args) == 2:
        target = args[1]
        if command != 'fetch':
            usage()
        elif '~' in target:
            # 'A~B': queue the topic ids A..B inclusive.
            m = target.split('~')
            for i in range(int(m[0]),int(m[1])+1):
                q.put(i)
        elif target.startswith("q="):
            search(target[2:])
        elif target.startswith("n="):
            normal(target[2:])
        elif target.startswith("r="):
            request(target[2:])
        elif '-' in target:
            fetchall(target)
        else:
            # Single topic: fetched synchronously on the module-level connection.
            fetchcmt(int(target),debug=True)
    else:
        usage()

    # Wait until every queued topic has been fetched and acknowledged.
    q.join()