#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# fetchvc.py fetch resources from verycd
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23
import urllib,urllib2
import re
import sqlite3
import time
import os,sys
import MySQLdb
import memcache
from hashlib import md5
from threading import Thread,Lock
from Queue import Queue
import download
#path = os.path.dirname(os.path.realpath(sys.argv[0]))
# Installation root: the SQLite files, logs and caches live underneath it.
path = '/var/www/simplecd.old'

# Module-level SQLite handles.  They are the default connections of fetch()
# and are used directly by dbcreate()/dblist().  text_factory is set to str
# on both of them.
conn = sqlite3.connect(path+'/verycd.sqlite3.db')
conn.text_factory = str
dbl = sqlite3.connect(path+'/lock.sqlite3.db')
dbl.text_factory = str

# A single MySQL connection that is reachable under both names used below.
statdb = MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
mysqldb = statdb

# Topic ids waiting to be fetched.
q = Queue()
# Worker count used by the (currently commented-out) thread pool.
MAXC = 8
# Held by fetch() around its SQLite writes.
g_mutex = Lock()
def thread_fetch():
    '''Worker loop: take topic ids from ``q`` forever and fetch each one.

    The worker opens its own SQLite and MySQL connections and hands them to
    fetch() instead of relying on the module-level ones.  The function never
    returns; the commented-out pool near the end of the file uses it as the
    target of daemon threads.
    '''
    conn = sqlite3.connect(path+'/verycd.sqlite3.db')
    conn.text_factory = str
    dbl = sqlite3.connect(path+'/lock.sqlite3.db')
    dbl.text_factory = str
    statdb = MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
    while True:
        topic = q.get()
        try:
            fetch(topic,conn=conn,dbl=dbl,statdb=statdb)
        except Exception as what:
            # One bad topic must not kill the worker; report it and move on.
            print(what)
        finally:
            # Always acknowledge the item, otherwise q.join() never returns.
            q.task_done()
def search(keyword,full=True):
    '''Search VeryCD for ``keyword`` and queue every topic id in the results.

    keyword -- text appended verbatim to the search URL (it is not URL-quoted
               here); it is also appended to search.log.
    full    -- when true, the "?start=N" pagination links found on the first
               result page are followed too, each with full=False so the
               recursion is only one level deep.
    '''
    searchlog = path+'/search.log'
    # "with" closes the log handle deterministically instead of leaving it
    # to the garbage collector.
    with open(searchlog,'a') as log:
        log.write('\n'+keyword+'\n')
    url = 'http://www.verycd.com/search/folders/'+keyword
    print('fetching search results ...')
    res = download.httpfetch(url)
    topics = re.compile(r'/topics/(\d+)',re.DOTALL).findall(res)
    links = []
    if full:
        links = re.compile(r'/search/folders/(.*?\?start=\d+)',re.DOTALL).findall(res)
        print(links)
    print(topics)
    if topics:
        # One handle for the whole batch rather than one per topic.
        with open(searchlog,'a') as log:
            for topic in topics:
                log.write(topic+',')
                q.put(topic)
    # links stays empty unless full is true, so no extra guard is needed.
    for key in links:
        search(key,full=False)
def hot():
''' Read the VeryCD home page, queue its hot resources and write static/hot.html; meant to be run every day. '''
# NOTE(review): this listing has no indentation, and the two patterns and the
# html literal below look as if their HTML markup was stripped (the literal is
# even split across three lines).  Confirm all of them against the upstream
# file before relying on this function.
url = 'http://www.verycd.com/'
print 'fetching homepage ...'
home = download.httpfetch(url)
# Isolate the part of the page that follows the "hot resources" heading, then
# collect the titles in book-title marks found inside it.
hotzone = re.compile(r'热门资源.*?',re.DOTALL).search(home).group()
hot = re.compile(r']*>(《.*?》)[^<]*',re.DOTALL).findall(hotzone)
# Start of the generated page; each hot topic appends one formatted line.
html = '
每日热门资源
\n'
# NOTE(review): with the single capture group visible above, findall() yields
# plain strings, so topic[0] would be the first character only -- presumably
# the original pattern also captured the topic id.  Confirm.
for topic in hot:
print 'fetching hot topic',topic[0],'...'
q.put(topic[0])
html += ' %s \n' % topic
# The file handle is not closed explicitly here.
open(path+'/static/hot.html','w').write(html)
def _queue_orz_pages(pages,stat):
    '''Queue every topic id listed on the login-only /orz pages of one kind.

    pages -- either a single page number ("7") or an inclusive range ("3-9").
    stat  -- value of the ``stat`` query parameter ('normal', 'request', 'all').
    '''
    if '-' in pages:
        (f,t) = [ int(x) for x in pages.split('-') ]
    else:
        f = t = int(pages)
    topic_re = re.compile(r'/topics/(\d+)',re.DOTALL)
    for page in range(f,t+1):
        url = 'http://www.verycd.com/orz/page%d?stat=%s' % (page,stat)
        idx = download.httpfetch(url,needlogin=True)
        ids = topic_re.findall(idx)
        if not ids:
            # Indexing ids[0] on an empty page used to raise IndexError and
            # abort the remaining pages; report it and carry on instead.
            print('no topics found on %s' % url)
            continue
        print(ids[0])
        for topic_id in ids:
            q.put(topic_id)

def normal(pages):
    '''Queue the topics on the "normal" /orz pages (login required).'''
    _queue_orz_pages(pages,'normal')

def request(pages):
    '''Queue the topics on the "request" /orz pages (login required).'''
    _queue_orz_pages(pages,'request')

def all(pages):
    '''Queue the topics on the "all" /orz pages (login required).

    The name shadows the ``all`` builtin inside this module; it is kept
    because the command-line dispatcher calls it by this name.
    '''
    _queue_orz_pages(pages,'all')
def feed():
    '''Read the VeryCD feed and queue every topic it mentions.

    Intended to be run every 30 minutes.  An earlier inline version stopped
    at the first topic updated more than 1900 seconds ago; since topics are
    only queued here, every id found in the feed is queued.
    '''
    url = 'http://www.verycd.com/sto/feed'
    print('fetching feed ...')
    feeds = download.httpfetch(url)
    # A set removes the ids that the feed repeats.
    ids = set(re.compile(r'/topics/(\d+)',re.DOTALL).findall(feeds))
    print(ids)
    for topic_id in ids:
        q.put(topic_id)
def update(num=10,off=1):
    '''Queue the topics shown on list pages ``off``..``num`` of /sto/~all.

    Pages are fetched with needlogin=True.  A page on which the
    "topic-list" ... "pnav" section cannot be found is skipped.
    '''
    section_re = re.compile(r'"topic-list"(.*?)"pnav"',re.DOTALL)
    topic_re = re.compile(r'/topics/(\d+)',re.DOTALL)
    for page_no in range(off,num+1):
        print('fetching list %s ...' % page_no)
        listing = download.httpfetch('http://www.verycd.com/sto/~all/page'+str(page_no),needlogin=True)
        sections = section_re.findall(listing)
        if not sections:
            continue
        # Only the first matching section is scanned; duplicates are dropped.
        found = set(topic_re.findall(sections[0]))
        print(found)
        for topic in found:
            q.put(topic)
def fetchall(ran='1-max',debug=False):
    '''Queue every topic id listed in a range of VeryCD archive pages.

    ran   -- "first-last"; ``last`` may be the word "max", in which case the
             highest archive number is read from the archive index page.
    debug -- accepted but not used by this function.
    '''
    urlbase = 'http://www.verycd.com/archives/'
    # The default '1-max' splits into ('1', 'max') like any other range, so
    # it needs no branch of its own.
    bounds = ran.split('-')
    m1 = int(bounds[0])
    if bounds[1] == 'max':
        index_page = urllib.urlopen(urlbase).read()
        m2 = int(re.compile(r'archives/(\d+)').search(index_page).group(1))
    else:
        m2 = int(bounds[1])
    print('fetching list from %s to %s ...' % (m1,m2))
    topic_re = re.compile(r'topics/(\d+)/',re.DOTALL)
    for number in range(m1,m2+1):
        url = urlbase + '%05d'%number + '.html'
        print('fetching from %s ...' % url)
        ids = topic_re.findall(download.httpfetch(url))
        print(ids)
        for topic_id in ids:
            q.put(topic_id)
def ensure_dir(f):
    '''Make sure the directory that would contain file path ``f`` exists.

    Only the directory part of ``f`` is created (with any missing parents);
    ``f`` itself is never touched.  A path without a directory component
    needs nothing created and returns immediately.

    Raises OSError if the directory cannot be created for any reason other
    than "it already exists".
    '''
    import errno
    d = os.path.dirname(f)
    if not d:
        # os.makedirs('') raises, and there is nothing to create anyway.
        return
    try:
        os.makedirs(d)
    except OSError as err:
        # Another thread or process may create the directory between a check
        # and the call, so "already exists" is treated as success.
        if err.errno != errno.EEXIST or not os.path.isdir(d):
            raise
def clear_idcache(id):
    '''Drop the cached copies of topic ``id``.

    Removes the idcache HTML file (if present) and deletes the memcache key
    derived from the id.  Both steps are best effort: a failure is printed
    and otherwise ignored, and a failure of the first step no longer
    prevents the second one.
    '''
    try:
        n = long(id)
        # Directory layout: last digit / next two digits / remaining digits.
        cachefile = path + '/idcache/%s/%s/%s.html'%(n%10, n//10%100, n//1000)
        if os.path.exists(cachefile):
            os.remove(cachefile)
    except Exception as what:
        print(what)
    try:
        mc = memcache.Client(['127.0.0.1:11211'], debug=0)
        mc.delete(md5('idsum'+str(id)).hexdigest())
    except Exception as what:
        print(what)
def cache_image(links):
    '''Download the given image URLs into the local imgcache tree.

    links -- iterable of image URLs; a false value means nothing to do.

    The file name is the URL without its "http://host" prefix and without
    slashes; it is stored under imgcache/<first char>/<next two chars>/.
    An image is downloaded only when its file is missing or empty.  Errors
    are printed and the remaining links are still processed.
    '''
    if not links:
        return
    host_re = re.compile(r'http://[^/]*')
    for link in links:
        #http://image-7.verycd.com/asdlkfjasdlfjsadkf()/thumb.jpg
        try:
            print('___caching %s' % link)
            name = host_re.sub(r'',link).replace('/','')
            target = path + '/imgcache/%s/%s/'%(name[0],name[1:3]) + name
            # One call is enough: the missing parent directories are created
            # together with the leaf directory.
            ensure_dir(target)
            if os.path.exists(target) and os.path.getsize(target) > 0:
                continue
            # Download first so that a failed fetch does not leave an empty
            # file behind, then write the bytes in binary mode.
            data = download.httpfetch(link)
            with open(target,'wb') as out:
                out.write(data)
        except Exception as what:
            print(what)
def fetch(id,conn=conn,debug=False,dbl=dbl,statdb=statdb,needlogin=False,cache=True,updtime=None):
# Fetch one VeryCD topic page, parse it and store the result.
#
# Parameters, as they are used below:
#   id        -- topic id; appended to the topic URL and used as the row key.
#   conn      -- SQLite connection handed to dbfind/dbinsert/dbupdate.
#   debug     -- when true, the parsed fields are printed.
#   dbl       -- SQLite connection that receives the row of the `lock` table.
#   statdb    -- MySQL connection used for the `t1` statistics table.
#   needlogin -- forwarded to download.httpfetch.
#   cache     -- when true, images referenced by the content are cached.
#   updtime   -- if given, replaces the update time parsed from the page.
# Returns pubtime[1] at the end; several earlier paths return None.
#
# NOTE(review): this listing has no indentation, and several patterns and
# string literals below look as if embedded HTML markup was stripped from
# them (two raw strings are even split across lines).  Confirm every regex
# and the block structure against the upstream file.
# db=MySQLdb.connect(user='root',passwd='guess8',db='simplecd')
print 'fetching topic',id,'...'
urlbase = 'http://www.verycd.com/topics/'
url = urlbase + str(id)
res = ''
# Up to three download attempts; any exception simply triggers the next try,
# and res stays '' when all of them fail.
for _ in range(3):
try:
res = download.httpfetch(url,report=True,needlogin=needlogin)
break
except:
continue
# NOTE(review): `abstract` is read here before it is assigned and `title` is
# never assigned in this listing -- the statements deriving them from `res`
# appear to be missing.  Confirm against upstream.
abstract = re.compile(r']*>(.*?)',re.DOTALL).findall(abstract)
if title:
title=title[0]
else:
return
# Parse the individual fields; any exception in this section is printed and
# makes the function return None.
try:
status = re.compile(r'requestIcon"[^>]*>\s*]*>(.*?)<',re.DOTALL).search(abstract).group(1)
brief = re.compile(r'摘要.*?(.*?)',re.DOTALL).search(abstract).group(1)
# Strip any remaining tags from the brief.
brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
# pubtime is a pair of captured "date-time" values; the code below uses
# index 0 as publish time and index 1 as update time.
pubtime = re.compile(r'时间.*?.*?date-time.*?>(.*?).*?date-time.*?>(.*?)',re.DOTALL).findall(abstract)[0]
# A caller-supplied updtime overrides the parsed update time.
if updtime:
pubtime = list(pubtime)
pubtime[1] = updtime
# Two category values; a second pattern is tried when the first finds nothing.
category = re.compile(r'align:top;">分类.*?.*?>.*?>(.*?).*?>(.*?)',re.DOTALL).findall(abstract)
if not category:
category = re.compile(r'align:top;">分类.*?.*?>\s*(.*?)\s+(.*?)\s*',re.DOTALL).findall(abstract)
category = list(category[0])
category[0] = category[0].replace(' ','')
category[1] = category[1].replace(' ','')
# ed2k links carrying subtitle_* attributes first, then every ed2k link.
ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)[^<]*>([^<]*)',re.DOTALL).findall(res)
ed2k.extend( re.compile(r'ed2k="([^"]*)"[^>]*>([^<]*)',re.DOTALL).findall(res) )
#delete duplicates
# NOTE(review): newed2k is the same list object as ed2k (no copy is made),
# so remove() mutates the list that is being indexed, and remove() drops the
# first equal element rather than the one at index i.
newed2k = ed2k
for i in range(len(ed2k)-1,-1,-1):
if ed2k[i] in ed2k[:i]:
newed2k.remove(ed2k[i])
#content = re.compile(r'.*?iptcomContents.*?',re.DOTALL).findall(res)
content = re.compile(r'iptcomContents">(.*?)',re.DOTALL).findall(res)
except Exception as what:
print what
return
# Clean up the description: cache its images, protect the OBJECT/PARAM/EMBED
# and img tags by turning them into [..], strip all other tags and entities,
# collapse blank space after newlines, then turn the protected tags back
# into <..> and rewrite the image host name to app-base.com.
if content:
content = content[0]
imglinks = re.compile(r'src="(http://image-\d*\.verycd\.com/[^"]*)"',re.I).findall(content)
try:
# s = Thread(target=cache_image(imglinks))
# s.start()
if cache:
cache_image(imglinks)
except:
pass
content = re.compile(r'<(/?OBJECT.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(/?PARAM.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(/?EMBED.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(img .*?)>').sub(r'[\1]',content)
content = re.compile(r'
',re.DOTALL).sub('\n',content)
content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
content = re.compile(r'\[(img .*?)\]').sub(r'<\1>
',content)
content = re.compile(r'\[(/?OBJECT.*?)\]').sub(r'<\1>',content)
content = re.compile(r'\[(/?PARAM.*?)\]').sub(r'<\1>',content)
content = re.compile(r'\[(/?EMBED.*?)\]').sub(r'<\1>',content)
content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content)
content = content.strip()
else:
content=''
vcpv = 0
#fetch stat
# Read the counter published by stat.verycd.com (0 when that fails), then
# insert a new t1 row or update vcpv/status of the existing one.  Errors in
# this section are printed and otherwise ignored.
c2 = statdb.cursor()
try:
try:
staturl = 'http://stat.verycd.com/counters/folder/'+str(id)+'/'
#st = download.httpfetch(staturl)
st = urllib2.urlopen(staturl).read()
vcpv = int(re.compile(r'\'(\d+)\'').findall(st)[0])
except Exception as what:
vcpv = 0
c2.execute('select * from t1 where id=%s',(id,))
r = [x for x in c2.fetchall()]
if r == []: # empty, insert
c2.execute('insert into t1 (id,comments,hits,score,title,brief,category1,updtime,status,vcpv) values \
(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (id,0,0,0,title,brief,category[0],pubtime[1],status,vcpv) )
else: # update
c2.execute('update t1 set vcpv=%s,status=%s where id=%s',(vcpv,status,id))
except Exception as what:
print what
pass
c2.close()
statdb.commit()
# update lock
owner = re.compile(r''']*>(.*?)''',re.M).findall(res)
if owner:
owner = owner[0]
# The lock-table write is done while holding g_mutex.  A failing execute is
# ignored; the commit is retried until it succeeds, so a commit that keeps
# failing loops forever with the mutex held.
g_mutex.acquire()
cl=dbl.cursor()
try:
cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv))
except:
pass
while True:
try:
dbl.commit()
break
except:
pass
cl.close()
g_mutex.release()
# Optional dump of everything that was parsed.
if debug:
if vcpv:
print vcpv
if owner:
print owner
print title
print status
print brief
print pubtime[0],pubtime[1]
print category[0],category[1]
for x in ed2k:
print x
print content
# Serialise the links: the fields of every entry are joined with a backtick
# and every entry is followed by one more backtick.
ed2kstr = ''
for x in ed2k:
ed2kstr += '`'.join(x)+'`'
# No links on the topic page: fall back to the linked file-search page and
# its "start=" pagination, collecting the raw ed2k:// URLs from there.
# NOTE(review): the loop variable below rebinds `ed2k`; the else/return
# presumably belongs to `if ed2kpage:` (give up when there is no such page)
# -- confirm against the indented original.
if ed2kstr == '':
ed2kpage = re.compile(r'href="(http://www\.verycd\.com/search/files/.*?rel)"').findall(res)
if ed2kpage:
url = ed2kpage[-1]
ed2kpage = download.httpfetch(url,report=True,needlogin=needlogin)
starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage)
for start in set(starts):
ed2kpage += download.httpfetch(url+"&start=%s"%start,report=True,needlogin=needlogin)
ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage)
ed2ks = sorted(list(set(ed2ks)))
for ed2k in ed2ks:
ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`'
else:
return
# Insert or update the SQLite row while holding g_mutex.  Any exception is
# swallowed after releasing the mutex; `tries` is not used afterwards.
tries=0
try:
g_mutex.acquire()
if not dbfind(id,conn):
dbinsert(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
else:
dbupdate(id,title,status,brief,pubtime,category,ed2kstr,content,conn)
g_mutex.release()
except:
g_mutex.release()
# Invalidate the cached copies of this topic.
clear_idcache(id)
#pp = '/var/www/simplecd.old/gz/'+str(id)[0:2]+'/'+str(id)[2:4]+'/'
#nn = str(id)+'.gz'
#ensure_dir(pp)
#import gzip
#gzip.open(pp+nn,'wb').write(res)
# mysql
# Uses the module-level `mysqldb` connection, not the `statdb` parameter.
c = mysqldb.cursor()
c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'')
)
mysqldb.commit()
c.close()
return pubtime[1]
def dbcreate():
    '''Create the ``verycd`` table in the module-level SQLite database.'''
    # NOTE(review): ten columns are declared here, while dbinsert() supplies
    # eleven values per row -- confirm which of the two is out of date.
    schema = '''create table verycd(
verycdid integer primary key,
title text,
status text,
brief text,
pubtime text,
updtime text,
category1 text,
category2 text,
ed2k text,
content text
)'''
    cursor = conn.cursor()
    cursor.execute(schema)
    conn.commit()
    cursor.close()
def dbinsert(id,title,status,brief,pubtime,category,ed2k,content,conn):
    '''Insert one topic row into the SQLite ``verycd`` table and commit.

    pubtime  -- sequence; index 0 and 1 fill the pubtime and updtime columns.
    category -- sequence; index 0 and 1 fill category1 and category2.
    conn     -- SQLite connection to write to.

    Eleven values are supplied: the ten fields followed by an empty string.
    The statement is attempted at most twice, one second apart.  If both
    attempts fail the database error is printed; nothing is raised for it,
    so callers keep their best-effort behaviour.  Errors that do not come
    from SQLite (for example a too-short ``pubtime``) propagate.
    '''
    row = (id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],
           ed2k,content,'')
    c = conn.cursor()
    try:
        for attempt in range(2):
            try:
                c.execute('insert into verycd values(?,?,?,?,?,?,?,?,?,?,?)',row)
                break
            except sqlite3.Error as what:
                failure = what
                if attempt == 0:
                    # Give a concurrent writer a moment before the retry.
                    time.sleep(1)
        else:
            # Both attempts failed: say so instead of failing silently.
            print('dbinsert failed for %s: %s' % (id,failure))
    finally:
        c.close()
    conn.commit()
def dbupdate(id,title,status,brief,pubtime,category,ed2k,content,conn):
    '''Overwrite the stored fields of topic ``id`` in ``verycd`` and commit.

    pubtime  -- sequence; index 0 and 1 fill the pubtime and updtime columns.
    category -- sequence; index 0 and 1 fill category1 and category2.
    conn     -- SQLite connection to write to.

    The statement is attempted at most twice, one second apart.  If both
    attempts fail the database error is printed; nothing is raised for it,
    so callers keep their best-effort behaviour.  Errors that do not come
    from SQLite (for example a too-short ``pubtime``) propagate.
    '''
    sql = ('update verycd set title=?,status=?,brief=?,pubtime=?,'
           'updtime=?,category1=?,category2=?,ed2k=?,content=? where verycdid=?')
    row = (title,status,brief,pubtime[0],pubtime[1],category[0],category[1],
           ed2k,content,id)
    c = conn.cursor()
    try:
        for attempt in range(2):
            try:
                c.execute(sql,row)
                break
            except sqlite3.Error as what:
                failure = what
                if attempt == 0:
                    # Give a concurrent writer a moment before the retry.
                    time.sleep(1)
        else:
            # Both attempts failed: say so instead of failing silently.
            print('dbupdate failed for %s: %s' % (id,failure))
    finally:
        c.close()
    conn.commit()
def dbfind(id,conn):
    '''Return True if ``verycd`` has a row whose verycdid equals ``id``.

    id   -- topic id to look up.
    conn -- SQLite connection to query.

    Returns False when there is no such row.  The row is read before the
    cursor is closed: iterating a cursor that has already been closed raises
    sqlite3.ProgrammingError.
    '''
    c = conn.cursor()
    try:
        c.execute('select 1 from verycd where verycdid=?',(id,))
        return c.fetchone() is not None
    finally:
        c.close()
def dblist():
    '''Print every field of every ``verycd`` row, one value per line.'''
    cursor = conn.cursor()
    cursor.execute('select * from verycd')
    # Rows are streamed from the cursor and flattened lazily.
    for value in (field for row in cursor for field in row):
        print(value)
def usage():
    '''Print the command-line help text.'''
    help_text = '''Usage:
python fetchvc.py createdb
python fetchvc.py fetchall
python fetchvc.py fetch 1-1611 #fetch archive list
python fetchvc.py fetch 5633~5684 #fetch topics
python fetchvc.py fetch 5633 #fetch a topic
python fetchvc.py fetch q=keyword
python fetchvc.py list #list the database
python fetchvc.py feed #run every 30 min to keep up-to-date
python fetchvc.py hot
python fetchvc.py update #update first 20 pages, run on a daily basis'''
    print(help_text)
#initialize thread pool
#for i in range(MAXC):
# t = Thread(target=thread_fetch)
# t.setDaemon(True)
# t.start()
if __name__=='__main__':
    # Command-line dispatcher.  Most commands only put topic ids on ``q``;
    # the queue is then worked off at the bottom of this block.
    from Queue import Empty

    argc = len(sys.argv)
    if argc == 1:
        usage()
    elif argc == 2:
        cmd = sys.argv[1]
        if cmd == 'createdb':
            dbcreate()
        elif cmd == 'fetchall':
            fetchall()
        elif cmd.startswith('update'):
            if cmd == 'update':
                update(20)
            else:
                # "updateN" -> pages 1..N, "updateM-N" -> pages M..N
                ran = cmd[6:].split('-')
                if len(ran) == 2:
                    update(int(ran[1]),int(ran[0]))
                else:
                    update(int(ran[0]))
        elif cmd == 'feed':
            feed()
        elif cmd == 'hot':
            hot()
        elif cmd == 'list':
            dblist()
        else:
            # An unknown command used to be ignored without any message.
            usage()
    elif argc == 3:
        arg = sys.argv[2]
        if sys.argv[1] != 'fetch':
            usage()
        # The prefixed forms are tested first so that a search keyword
        # containing '~' or '-' is not mistaken for a range.
        elif arg.startswith("q="):
            search(arg[2:])
        elif arg.startswith("n="):
            normal(arg[2:])
        elif arg.startswith("r="):
            request(arg[2:])
        elif arg.startswith("a="):
            all(arg[2:])
        elif '~' in arg:
            m = arg.split('~')
            for i in range(int(m[0]),int(m[1])+1):
                q.put(i)
        elif '-' in arg:
            fetchall(arg)
        else:
            fetch(int(arg),debug=True)
    else:
        usage()

    # The worker pool (thread_fetch) is not started anywhere in this file,
    # so the queued topics are fetched here in the main thread; otherwise
    # nothing would consume the queue and q.join() would block forever.
    # get_nowait() keeps this loop correct even if workers are re-enabled.
    while True:
        try:
            topic = q.get_nowait()
        except Empty:
            break
        try:
            fetch(topic)
        except Exception as what:
            # One failing topic must not abort the rest of the batch.
            print(what)
        finally:
            q.task_done()
    # wait all threads done
    q.join()