#coding:utf-8
import selenium.webdriver
import re
import sqlite3
import MySQLdb
import time
from fetchvc_noth import clear_idcache,cache_image,fetch
#import fetchvc
# Account referenced by the (currently commented-out) selenium login steps.
# NOTE(review): credentials are hard-coded in source control here and in the
# MySQL connect call below -- consider moving them to configuration.
username = 'wilken_h@163.com'
password = 'weijian23885430'

# Directory that holds both sqlite database files.
path = '/var/www/simplecd.old'

# Main sqlite store; text_factory=str makes TEXT columns come back as str.
conn = sqlite3.connect('%s/verycd.sqlite3.db' % path)
conn.text_factory = str

# Separate sqlite database holding the `lock` table.
dbl = sqlite3.connect('%s/lock.sqlite3.db' % path)
dbl.text_factory = str

# One MySQL connection, reachable under two names.
statdb = MySQLdb.connect(db='simplecd', user='root', passwd='guess8')
mysqldb = statdb

# When true, parse() prints every extracted field.
debug = True
class adv_vc_fetcher:
def __init__(self):
#self.br = selenium.webdriver.firefox.webdriver.WebDriver()
#self.br.set_proxies({'http':'127.0.0.1:3128'})
#self.br.set_handle_robots(False)
self.resp = None
def login(self):
    """Announce the two login steps on stdout.

    The browser interaction itself is disabled (see the commented-out
    lines), so this method only prints progress messages and returns None.
    """
    print("getting homepage...")
    #self.br.get("http://secure.verycd.com/signin")
    #self.br.find_element("name","username").send_keys(username)
    #self.br.find_element("name","password").send_keys(password)
    print("submitting login info...")
    #self.br.find_element("name","login_submit").click()
def fetch(self, diff=3600, recent_path="/root/recent", start=2263):
    """Fetch every id listed in the "recent" file that MySQL does not have yet.

    Each line of the file is split on commas; the first field is ignored and
    the remaining fields are treated as candidate ids.  Fields that are not
    integers are skipped, ids already present in the ``verycd`` table are
    reported and skipped, and every other id is handed to the module-level
    ``fetch`` function.

    Parameters:
        diff: accepted for backward compatibility; not used.
        recent_path: path of the comma-separated id file.
        start: number of leading lines of the file to skip.

    Returns None.
    """
    cursor = mysqldb.cursor()
    try:
        cursor.execute('select verycdid from verycd')
        # A set gives O(1) membership tests in the loop below.
        known_ids = set(row[0] for row in cursor.fetchall())
    finally:
        cursor.close()

    # Read the whole file up front so the handle is closed before the
    # (potentially long) fetch loop starts.
    with open(recent_path) as recent:
        id_groups = [line.strip().split(',')[1:] for line in recent]

    for group in id_groups[start:]:
        for raw in group:
            try:
                vid = int(raw)
            except ValueError:
                # Not a numeric id (e.g. an empty field): ignore it.
                continue
            if vid in known_ids:
                print('%s exists' % vid)
                continue
            print('getting id %s' % vid)
            #fetchvc.q.put(vid)
            # `fetch` here is the function imported at module level, not
            # this method.
            fetch(vid, conn=conn, dbl=dbl, statdb=statdb, debug=True, cache=False)
def parse(self,id,res):
# Extract one entry's metadata and ed2k links from the page HTML `res` and
# store them under `id` in the lock database (dbl), MySQL (statdb) and
# sqlite (conn).
# Returns None on every path. It returns early when no title is found, when
# extracting a field raises (the exception is printed with a 'parse1'
# prefix), or when no ed2k link can be collected.
# NOTE(review): in this copy of the file several regex literals look
# truncated (the literal opened on the next line is cut off, and `title` is
# never assigned before it is tested) -- presumably HTML markup inside the
# patterns was lost. Confirm against the original source before relying on
# any pattern below.
abstract = re.compile(r'
]*>(.*?)',re.DOTALL).findall(abstract)
# Without a title there is nothing to store.
if title:
title=title[0]
else:
return
# Pull status, summary, the two timestamps and the two category parts out of
# `abstract`. Any failure in here (for example a search() that finds
# nothing) is caught by the `except Exception` further down.
try:
status = re.compile(r'requestIcon"[^>]*>\s*]*>(.*?)<',re.DOTALL).search(abstract).group(1)
brief = re.compile(r'摘要.*?(.*?)',re.DOTALL).search(abstract).group(1)
# Strip any remaining tags from the summary.
brief = re.compile(r'<.*?>',re.DOTALL).sub('',brief).strip()
pubtime = re.compile(r'时间.*?.*?date-time" title="(.*?)".*?date-time" title="(.*?)"',re.DOTALL).findall(abstract)[0]
category = re.compile(r'top;">分类.*?.*?top;">.*?>(.*?).*?>(.*?)',re.DOTALL).findall(abstract)
# Second pattern is tried only when the first one found nothing.
if not category:
category = re.compile(r'top;">分类.*?.*?top;">\s*(.*?)\s+(.*?)\s*',re.DOTALL).findall(abstract)
category = list(category[0])
# Remove spaces and UTF-8 encoded non-breaking spaces ('\xc2\xa0') from both
# category parts.
category[0] = category[0].replace(' ','').replace('\xc2\xa0','')
category[1] = category[1].replace(' ','').replace('\xc2\xa0','')
category[0] = category[0].strip()
category[1] = category[1].strip()
#ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)[^<]*>([^<]*)',re.DOTALL).findall(res)
# Collect (ed2k attribute value, following text) pairs from the full page.
ed2k = []
ed2k.extend( re.compile(r'ed2k="([^"]*)"[^>]*>([^<]*)',re.DOTALL).findall(res) )
#delete duplicates
# NOTE: newed2k is the same list object as ed2k (no copy is made), so the
# remove() call below mutates ed2k itself; list.remove drops the first equal
# entry.
newed2k = ed2k
for i in range(len(ed2k)-1,-1,-1):
if ed2k[i] in ed2k[:i]:
newed2k.remove(ed2k[i])
# NOTE(review): the pattern on the next line looks truncated; confirm.
content = re.compile(r'(',re.DOTALL).findall(res)
except Exception as what:
print 'parse1',what
return
if content:
content = content[0]
# Temporarily rewrite OBJECT/PARAM/EMBED/img tags into [..] form so they
# survive the generic tag stripping below; they are turned back into <..>
# afterwards.
content = re.compile(r'<(/?OBJECT.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(/?PARAM.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(/?EMBED.*?)>',re.DOTALL).sub(r'[\1]',content)
content = re.compile(r'<(img .*?)>').sub(r'[\1]',content)
content = re.compile(r'
',re.DOTALL).sub('\n',content)
# Drop all remaining tags, replace entities with a space, and collapse a
# newline followed by whitespace into a single newline.
content = re.compile(r'<.*?>',re.DOTALL).sub('',content)
content = re.compile(r'&.*?;',re.DOTALL).sub(' ',content)
content = re.compile(r'\n\s+',re.DOTALL).sub('\n',content)
content = re.compile(r'\[(img .*?)\]').sub(r'<\1>
',content)
content = re.compile(r'\[(/?OBJECT.*?)\]').sub(r'<\1>',content)
content = re.compile(r'\[(/?PARAM.*?)\]').sub(r'<\1>',content)
content = re.compile(r'\[(/?EMBED.*?)\]').sub(r'<\1>',content)
# Point image hosts image-N.verycd.com at image-N.app-base.com.
content = re.compile(r'(image-\d*)\.verycd\.com',re.I).sub(r'\1.app-base.com',content)
content = content.strip()
imglinks = re.compile(r'src="(http://image-\d*\.app-base\.com/[^"]*)"',re.I).findall(content)
# Best effort: any error raised by cache_image is ignored.
try:
cache_image(imglinks)
except:
pass
else:
content=''
# vcpv is fixed at 0 here, so the `if vcpv` debug print below never fires.
vcpv=0
# owner stays an empty list when no member link is found in the page.
owner = re.compile(r'''"/members/@u\d+/">([^<]+)''',re.M).findall(res)
if owner:
owner = owner[0]
# Record the entry in the lock database; a failing statement is only printed.
cl=dbl.cursor()
try:
cl.execute('replace into lock values (?,?,?,?,?,?,?)',(long(id),True,owner,'',title,pubtime[1],vcpv))
except Exception,what:
print what
# Retry the commit until it succeeds; each failure is printed and the loop
# has no other exit.
while True:
try:
dbl.commit()
break
except Exception,what:
print what
cl.close()
# `debug` here is the module-level flag, not the method of the same name.
if debug:
if vcpv:
print vcpv
if owner:
print owner
print title
print status
print brief
print pubtime[0],pubtime[1]
print category[0],category[1]
for x in ed2k:
print x
print content
# Serialise the ed2k tuples as backtick-separated fields, each tuple followed
# by a trailing backtick.
ed2kstr = ''
for x in ed2k:
ed2kstr += '`'.join(x)+'`'
# Fallback when the page itself carried no ed2k attributes: follow the last
# search/files link and scrape ed2k:// URLs from that page and from the pages
# named by its generateUrl('start', N) links.
# NOTE(review): self.br is not assigned in the visible __init__ (the
# assignment is commented out), so this path presumably raises
# AttributeError -- confirm.
if ed2kstr == '':
ed2kpage = re.compile(r'"(http://www\.verycd\.com/search/files/.*?rel)"').findall(res)
if ed2kpage:
url = ed2kpage[-1]
self.br.get(url)
ed2kpage = self.br.get_page_source()
starts = re.compile(r'''javascript:generateUrl\('start',(\d+)\)''').findall(ed2kpage)
for start in set(starts):
self.br.get(url+"&start=%s"%start)
ed2kpage += self.br.get_page_source()
ed2kpage = ed2kpage.encode('utf-8')
ed2ks = re.compile(r'ed2k://.*?\|/').findall(ed2kpage)
# De-duplicate and sort the links.
ed2ks = set(ed2ks)
ed2ks = sorted(list(ed2ks))
# Each link is stored together with its third '|'-separated field
# (presumably the file name of an ed2k file link -- confirm).
for ed2k in ed2ks:
ed2kstr += ed2k + '`' + ed2k.split('|')[2] + '`'
print ed2kstr
if not ed2kstr:
return
# NOTE(review): with indentation lost it is unclear which `if` this `else`
# pairs with -- presumably `if ed2kpage:`; confirm.
else:
return
# clear_idcache is imported from fetchvc_noth; its effect is not visible here.
clear_idcache(id)
# mysql
c = statdb.cursor()
try:
c.execute("replace into verycd values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'')
)
statdb.commit()
except Exception,what:
print 'mysql',what
c.close()
#sqlite
# Same row as above, written to the sqlite copy of the table.
c = conn.cursor()
try:
c.execute("replace into verycd values (?,?,?,?,?,?,?,?,?,?,?)",
(id,title,status,brief,pubtime[0],pubtime[1],category[0],category[1],ed2kstr,content,'')
)
conn.commit()
except Exception,what:
# commit is still called after a failure, then the error is printed.
conn.commit()
print 'sqlite3',what
c.close()
def debug(self):
    """Print the page source reported by the browser object ``self.br``."""
    page_source = self.br.get_page_source()
    print(page_source)
if __name__ == '__main__':
    import os
    import sys

    # Script entry point: build a fetcher, run the login step, then fetch.
    fetcher = adv_vc_fetcher()
    fetcher.login()

    # The first command-line argument, when present, is forwarded unchanged
    # (as a string); otherwise 2400 is used.
    diff = sys.argv[1] if len(sys.argv) > 1 else 2400
    fetcher.fetch(diff)
    #f.parse(5420,open("test").read())