#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# download.py: download with report
#
# author: observer
# email: jingchaohu@gmail.com
# blog: http://obmem.com
# last edit @ 2009.12.23

import os
import sys
import urllib
import urllib2
import cookielib
import re
import gzip
import zlib
import StringIO
from time import time, sleep

import mechanize

#path = os.path.dirname(os.path.realpath(sys.argv[0]))
# Directory that holds the 'errors' log file appended to by httpfetch().
path = '/var/www/simplecd.old'

#dproxy = 'http://69.163.35.68:3128'
#dproxy = 'http://stdyun.com:41300'
dproxy = 'http://localhost:3128'
#dproxy = 'http://206.220.172.213:3128'
#dproxy = 'http://184.82.232.152:3128'

# NOTE(review): these credentials are hard-coded in the source file. They are
# kept so the script behaves as before, but they should be moved to
# configuration outside version control.
LOGIN_USERNAME = 'wilken_h@163.com'
LOGIN_PASSWORD = 'weijian23885430'

# Module state shared by useproxy(), login() and httpfetch().
islogin = False   # set to True by login()
isproxy = False   # set to True by useproxy()
br = None         # mechanize.Browser created by login()


def _login_postdata():
    """Return the url-encoded form body for the VeryCD sign-in request."""
    return urllib.urlencode({'username': LOGIN_USERNAME,
                             'password': LOGIN_PASSWORD,
                             'continue': 'http://www.verycd.com/',
                             'login_submit': '登录',
                             'save_cookie': 1,
                             })


def useproxy(proxy=dproxy):
    """Install a global urllib2 opener that sends http traffic via *proxy*.

    Also sets the module flag ``isproxy`` so httpfetch() does not install the
    opener again.
    """
    global isproxy
    proxy_support = urllib2.ProxyHandler({'http': proxy})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    isproxy = True


def login():
    """Create the proxied mechanize browser and mark the module as logged in.

    NOTE(review): no sign-in request is submitted here; the function only
    creates ``br`` (proxied through ``dproxy``) and sets ``islogin``. The
    form body that the ``__main__`` script posts is built by
    _login_postdata().
    """
    global br, islogin
    print('try to login...')
    br = mechanize.Browser()
    br.set_proxies({'http': dproxy})
    print('...getting login form...')
    islogin = True


#functions
def report(blocknum, bs, size, t):
    """Print one progress line for a running download.

    blocknum -- number of blocks read so far
    bs       -- block size in bytes
    size     -- total size in bytes, or a negative value when unknown
    t        -- seconds spent on the last block (0 is treated as 1)
    """
    if t == 0:
        t = 1
    done = blocknum * bs
    speed = bs / t / 1024
    if size < 0:
        print('%10s downloaded | Speed =%5.2fKB/s' % (done, speed))
        return
    # An empty body (Content-Length: 0) is complete by definition; dividing
    # by size here would raise ZeroDivisionError.
    percent = int(done * 100 / size) if size else 100
    print('%10s/%sdownloaded | %s%% Speed =%5.2fKB/s'
          % (done, size, percent, speed))


def _log_error(url):
    """Append *url* to the 'errors' file under ``path``."""
    with open(path + '/errors', 'a') as errlog:
        errlog.write(url + '\n')


def _content_length(response_headers):
    """Return the Content-Length header as an int, or -1 if absent/invalid."""
    value = response_headers.get('Content-Length')
    if value is None:
        return -1
    try:
        size = int(value)
    except ValueError:
        return -1
    if size < 0:
        return -1
    return size


def _decompress(raw, content_encoding):
    """Return *raw* decompressed, or *raw* itself if it is not compressed.

    gzip is always attempted first. A deflate decode is attempted only when
    the Content-Encoding header names it.
    """
    try:
        return gzip.GzipFile(fileobj=StringIO.StringIO(raw)).read()
    except Exception:
        # Not gzip data (or a damaged stream): fall through to the other
        # options and, failing those, hand back the bytes unchanged.
        pass
    if 'deflate' in content_encoding.lower():
        # Servers send either zlib-wrapped or raw deflate streams under this
        # name, so try both window settings.
        for wbits in (zlib.MAX_WBITS, -zlib.MAX_WBITS):
            try:
                return zlib.decompress(raw, wbits)
            except zlib.error:
                continue
    return raw


def httpfetch(url, headers=None, reporthook=report, postData=None,
              report=True, needlogin=False):
    """Download *url* and return its body as a string ('' on failure).

    url        -- address to fetch; URLs containing 'counters' are requested
                  without compression and returned undecoded
    headers    -- optional mapping of extra request headers (not modified)
    reporthook -- callable(blocknum, bs, size, t) invoked for progress
    postData   -- optional url-encoded body; passed to urllib2.Request
    report     -- when false, reporthook is never called (note that this
                  parameter shadows the module-level report() function)
    needlogin  -- call login() first if the module is not logged in yet

    The request is attempted up to three times with a 0.5s pause after each
    failure. If every attempt fails, or a read fails mid-transfer, the URL is
    appended to the 'errors' file and '' is returned. '' is also returned
    when fewer bytes arrive than the Content-Length header announced.
    """
    # Work on a copy: the caller's dict (or a shared default) must not pick
    # up Accept-Encoding, otherwise later 'counters' requests would ask for
    # gzip and then return the compressed bytes undecoded.
    request_headers = dict(headers) if headers else {}
    wants_compression = 'counters' not in url
    if wants_compression:
        request_headers['Accept-Encoding'] = 'gzip,deflate'

    if needlogin and not islogin:
        login()
    if not isproxy and not islogin:
        useproxy()

    fp = None
    for _ in range(3):
        try:
            fp = urllib2.urlopen(
                urllib2.Request(url, postData, request_headers))
            break
        except Exception:
            # Best-effort retry; KeyboardInterrupt/SystemExit still propagate.
            sleep(0.5)
    if fp is None:
        _log_error(url)
        return ''

    try:
        response_headers = fp.info()
        bs = 1024 * 8
        # Parsed regardless of reporting so the truncation check below
        # always applies.
        size = _content_length(response_headers)
        show_progress = bool(reporthook and report)

        chunks = []
        read = 0
        blocknum = 0
        if show_progress:
            reporthook(blocknum, bs, size, 1)
        t0 = time()
        while True:
            try:
                block = fp.read(bs)
            except Exception:
                _log_error(url)
                return ''
            if not block:
                print('... %s downloaded' % url)
                break
            chunks.append(block)
            read += len(block)
            blocknum += 1
            if show_progress:
                reporthook(blocknum, bs, size, time() - t0)
            t0 = time()
    finally:
        fp.close()

    # Give up if the actual size does not match the content-length header.
    if size >= 0 and read < size:
        return ''

    rawdata = ''.join(chunks)
    if not wants_compression:
        return rawdata
    return _decompress(rawdata, response_headers.get('Content-Encoding') or '')


if __name__ == '__main__':
    #login()
    req = mechanize.Request("http://secure.verycd.com/signin",
                            _login_postdata())
    resp = mechanize.urlopen(req)
    print(resp.info())
    print(resp.read())