Quantcast
Channel: Security Tools – Security List Network™
Viewing all articles
Browse latest Browse all 443

cloudget v0.72 released – python script to bypass cloudflare from command line.

$
0
0

python script to bypass cloudflare from command line, with extensive scraping, link harvesting, and recursive directory downloading. built upon cfscrape module.

python script to bypass cloudflare from command line

python script to bypass cloudflare from command line

install using pip:
– pip install cfscrape
– pip install BeautifulSoup
Support All OS/Platform
cloudget.py Script:

#!/usr/bin/env python
# cloudget v0.72
# release date: October 21, 2015
# author: vvn < lost @ nobody . ninja >

import sys, argparse, codecs, subprocess, os, re, random, requests, string, time, traceback
from datetime import date, datetime
from urlparse import urlparse
from subprocess import PIPE, check_output, Popen

try:
   import cfscrape
except:
   pass
   try:
      os.system('pip install cfscrape')
      import cfscrape
   except:
      print('unable to install the cfscrape module via pip. this script requires cfscrape to run. get it here: https://github.com/Anorov/cloudflare-scrape')
      sys.exit(1)

intro = '''\n
\033[40m\033[34m=============================================================\033[0m
\033[40m\033[32m=============================================================\033[0m
\033[40m\033[90;1m---------------------- CLOUDGET v0.72 -----------------------\033[0m
\033[40m\033[34;21m=============================================================\033[0m
\033[40m\033[32m=============================================================\033[0m
\033[40m\033[35;1m----------------------- author : vvn ------------------------\033[0m
\033[40m\033[35m--------------- lost [at] nobody [dot] ninja ----------------\033[0m
\033[40m\033[34;1m=============================================================\033[0m
\033[40m\033[37;21m---------------- support my work: buy my EP! ----------------\033[0m
\033[40m\033[35m-------------------- http://dreamcorp.us --------------------\033[0m
\033[40m\033[35m--------------- facebook.com/dreamcorporation ---------------\033[0m
\033[40m\033[37;1m------------------ thanks for the support! ------------------\033[0m
\033[40m\033[34;1m=============================================================\033[0m
\033[21m\n'''

if os.name == 'nt' or sys.platform == 'win32':
   intro = '''\n
   =============================================================
   =============================================================
   ---------------------- CLOUDGET v0.72 -----------------------
   =============================================================
   =============================================================
   ----------------------- author : vvn ------------------------
   --------------- lost [at] nobody [dot] ninja ----------------
   =============================================================
   ---------------- support my work: buy my EP! ----------------
   -------------------- http://dreamcorp.us --------------------
   --------------- facebook.com/dreamcorporation ---------------
   ------------------ thanks for the support! ------------------
   =============================================================
   \n'''

print(intro)

try:
   from bs4 import BeautifulSoup, UnicodeDammit
except:
   pass
   try:
      os.system('pip install BeautifulSoup')
      from bs4 import BeautifulSoup
   except:
      print('BeautifulSoup module is required to run the script.')
      sys.exit(1)

global cfurl
global usecurl
global writeout
global depth
global useproxy
global debug
global depth
global finished
global firsturl
usecurl = 0
writeout = 0
depth = 0
useproxy = 0
debug = 0
depth = 0
links = 0
finished = []

parser = argparse.ArgumentParser(description="a script to automatically bypass anti-robot measures and download links from servers behind a cloudflare proxy")

parser.add_argument('-u', '--url', action='store', help='[**REQUIRED**] full cloudflare URL to retrieve, beginning with http(s)://', required=True)
parser.add_argument('-o', '--out', help='save returned content to \'download\' subdirectory', action='store_true', required=False)
parser.add_argument('-l', '--links', help='scrape content returned from server for links', action='store_true', required=False)
parser.add_argument('-c', '--curl', nargs='?', default='empty', const='curl', dest='curl', metavar='CURL_OPTS', help='use cURL. use %(metavar)s to pass optional cURL parameters. (for more info try \'curl --manual\')', required=False)
parser.add_argument('-p', '--proxy', action='store', metavar='PROXY_SERVER:PORT', help='use a proxy to connect to remote server at [protocol]://[host]:[port] (example: -p http://localhost:8080) **only use HTTP or HTTPS protocols!', required=False)
parser.add_argument('-d', '--debug', help='show detailed stack trace on exceptions', action='store_true', required=False)
parser.add_argument('--version', action='version', version='%(prog)s v0.70 by vvn <lost@nobody.ninja>, released October 12, 2015.')

args = parser.parse_args()
if args.out:
   writeout = 1
if args.links:
   links = 1
if args.debug:
   debug = 1
if args.proxy:
   useproxy = 1
   proxy = args.proxy
   if not re.search(r'^(http[s]?|socks(4[a]?|5)?)', proxy):
      print("\ninvalid argument supplied for proxy server. must specify as [protocol]://[server]:[port], where [protocol] is either http or https. (for example, http://127.0.0.1:8080) \n")
      sys.exit(1)
   x = urlparse(args.proxy)
   proxyhost = str(x.netloc)
   proxytype = str(x.scheme)
if args.curl in 'empty':
   usecurl = 0
elif args.curl is 'curl':
   usecurl = 1
else:
   usecurl = 1
   global curlopts
   curlopts = args.curl

cfurl = args.url
firsturl = cfurl

print("\nURL TO FETCH: %s \n" % cfurl)

if 'proxy' in locals():
   if 'https' in proxytype:
      proxystring = {'https': '%s' % proxyhost}
   else:
      proxystring = {'http': '%s' % proxyhost}
   print("using %s proxy server: %s \n" % (str(proxytype.upper()), str(proxyhost)))
else:
   proxystring = None
   print("not using proxy server \n")

if not re.match(r'^http$', cfurl[:4]):
   print("incomplete URL provided: %s \r\ntrying with http:// prepended..")
   cfurl = "http://" + cfurl

depth = 0

def getCF(cfurl, links):

   checkcurl = ''
   checklinks = ''
   if links == 1:
      checklinks = 'yes'
      global followdirs
   else:
      checklinks = 'no'
   if usecurl == 1:
      checkcurl = 'yes'
   else:
      checkcurl = 'no'

   if debug == 1:
      print("\n\033[32;1mlocals: \n\033[0m")
      for name, val in locals().iteritems():
         print("\033[35;1m%s:\033[32;21m %s \033[0m" % (str(name), str(val)))
      print("\n\033[32;1mglobals: \n\033[0m")
      for name, val in globals().iteritems():
         print("\n\033[35;1m%s:\033[36;21m %s \033[0m" % (str(name), str(val)))
      print('\033[0m\r\n')
      print("\n\033[31;1musing curl:\033[31;21m\033[33m %s \033[0m\n" % checkcurl)
      print("\n\033[34;1mharvesting links:\033[34;21m\033[33m %s \033[0m\n" % checklinks)

   p = urlparse(cfurl)
   part = p.path.split('/')[-1]
   path = p.path.strip(part)
   if '/' not in path[:1]:
      path = '/' + path
   urlfqdn = p.scheme + '://' + p.netloc
   parent = urlfqdn + path
   childdir = path.strip('/')
   domaindir = os.path.join('download', p.netloc)
   parentdir = os.path.join(domaindir, childdir)
   
   if firsturl in finished and cfurl in firsturl:
      print('\nABORTING: already retrieved %s!\n') % firsturl
      sys.exit(1)

   global outfile
   outfile = cfurl.split('?')[0]
   outfile = outfile.split('/')[-1]

   if writeout == 1:
      global existing
      global checkresume
      p = urlparse(cfurl)
      if not os.path.exists('download'):
         os.makedirs('download')
      if not os.path.exists(domaindir):
         os.makedirs(domaindir)
      filename = cfurl.lstrip('https:').strip('/')
      filename = filename.rstrip(outfile)
      dirs = filename.split('/')
      a = 'download'
      i = 1
      for dir in dirs:
         while i < len(dirs):
            if not re.search(r'^(.*)\.[.]+$', dir):
               a = os.path.join(a, dir)
               if not os.path.exists(a):
                  os.makedirs(a)
               i += 1
            else:
               break
      if len(outfile) < 1 or outfile in p.netloc:
         outfile = 'index.html'
         outdir = filename.strip()
      else:
         part = outfile
         outdir = filename.rstrip(part)
      fulloutdir = os.path.join('download', outdir)
      outfile = outfile.strip('/')
      if not os.path.exists(fulloutdir):
         os.makedirs(fulloutdir)
      print("output file: %s \n" % outfile)
      global savefile
      savefile = os.path.join(fulloutdir, outfile)
      cwd = os.getcwd()
      fullsavefile = os.path.join(cwd, savefile)
      print("full path to output file: %s \n" % fullsavefile)
      
   else:
      if len(outfile) < 1 or outfile in p.netloc:
         outfile = 'index.html'

   scraper = cfscrape.create_scraper()
   ualist = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)',
'Mozilla/5.0 (Windows NT 6.3; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
'Mozilla/5.0 (X11; SunOS i86pc; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; FreeBSD amd64; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; FreeBSD i386; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; Linux i586; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; OpenBSD amd64; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; OpenBSD alpha; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; OpenBSD sparc64; rv:38.0) Gecko/20100101 Firefox/38.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20121202 Firefox/17.0 Iceweasel/17.0.1',
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14 Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14'
]
   n = random.randint(0,len(ualist)) - 1
   ua = ualist[n].strip()
   
   def cfcookie(cfurl):
      sess = requests.session()
      p = urlparse(cfurl)
      mnt = p.scheme + '://'
      sess.mount(mnt, cfscrape.CloudflareAdapter())
      sess.get(cfurl)
      #sess.cookies
      l = sess.get(cfurl)
      b = sess.cookies
      if b:
         c = b.items()
         for s, t in c:
            cs = u''.join(s).encode('utf-8').strip()
            ct = u''.join(t).encode('utf-8').strip()
            print('\033[34;1m' + str(cs) + '\033[0m')
            print('\033[32;1m' + str(ct) + '\033[0m')
         cookies = "\"cf_clearance\"=\"%s\"" % sess.cookies.get('cf_clearance')
         if sess.cookies.get('__cfduid'):
            cookies = cookies + ";\"__cfduid\"=\"%s\"" % sess.cookies.get('__cfduid')
      else:
         cookies = None
      return cookies

   def getpage(cfurl):      
      r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      if 'text' in r.headers.get('Content-Type'):
         #rt = unicode(r.content.lstrip(codecs.BOM_UTF8), 'utf-8')
         #rt = UnicodeDammit.detwingle(r.text)
         html = BeautifulSoup(r.text, "html.parser")
         print('\r\n--------------------------------------------------------\r\n')
         if debug == 1:
            orenc = str(html.original_encoding)
            print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
         bs = html.prettify(formatter=None)
         bsu = u''.join(bs).encode('utf-8').strip()
         print(bsu)
         print('\r\n--------------------------------------------------------\r\n')
      else:
         found = -1
      
      if debug == 1:
         print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
         
   # cURL request - using cURL for cloudflare URLs doesn't seem to work
   if usecurl == 1:

      r = scraper.get(cfurl, stream=True, verify=False, allow_redirects=True, proxies=proxystring)
      print("status: ")
      print(r.status_code)
      print("\ngetting cookies for %s.. \n" % cfurl)
      req = "GET / HTTP/1.1\r\n"
      cookie_arg = cfcookie(cfurl)
      if cookie_arg:
         req += "Cookie: %s\r\nUser-Agent: %s\r\n" % (cookie_arg, ua)
         houtput = check_output(["curl", "--cookie", cookie_arg, "-A", ua, "-s", cfurl])
         curlstring = '--cookie \'' + cookie_arg + '\' -A \'' + ua + '\' -k '
         if 'curlopts' in locals():
            curlstring = '--cookie \'' + cookie_arg + '\' ' + curlopts + ' -A \'' + ua + '\' -k '
      else:
         cookie_arg = cfscrape.get_cookie_string(cfurl)
         curlstring = '-A \'' + ua + '\' -k '
         if 'curlopts' in locals():
            curlstring = '-# ' + curlopts + ' -A \'' + ua + '\' -k '
         if proxy:
            curlstring += '-x %s ' % proxy
         if cookie_arg:
            curlstring += '--cookie \'' + cookie_arg + '\' '
            req += "Cookie: %s\r\nUser-Agent: %s\r\n" % (cookie_arg, ua)
            houtput = check_output(["curl", "-A", ua, "--cookie", cookie_arg, "-s", cfurl])
         else:
            req += "User-Agent: %s\r\n" % ua
            houtput = check_output(["curl", "-A", ua, "i", "-s", cfurl])
      
      print('\n\033[34;1msubmitting headers:\n\033[21m\033[37m%s \033[0m\n' % req)
      print("\nRESPONSE: \n%s \n" % str(houtput))
      msg = "\nfetching %s using cURL.. \n" % cfurl
      if writeout == 1:
         if os.path.exists(savefile):
            resumesize = os.path.getsize(savefile)
            print("\n%s already exists! \n" % outfile)
            print("\nlocal file size: %s bytes \n" % str(resumesize))
            if 'existing' not in globals():
               existing = 0
            if existing == 0:
               checkresume = raw_input('choose an option [1-3]: 1) resume download, 2) start new download, 3) skip. --> ')
               while not re.match(r'^[1-3]$', checkresume):
                  checkresume = raw_input('invalid input. enter 1 to resume, 2 to start new, or 3 to skip --> ')
               checkexist = raw_input('\ndo this for all downloads? Y/N --> ')
               while not re.match(r'^[YyNn]$', checkexist):
                  checkexist = raw_input('invalid entry. enter Y to use same action on existing files or N to always ask --> ')
               if checkexist.lower() == 'y':
                  existing = 1
               else:
                  existing = 0
            if checkresume == '1':
               curlstring = curlstring + '-C - -o \'' + savefile + '\' '
               msg = "\ntrying to resume download using cURL to %s.. \n" % savefile
            elif checkresume == '2':
               curlstring = curlstring + '-O '
               msg = "\nstarting new download to %s.. \n" % savefile
            else:
               msg = "\nskipping download for %s \n" % outfile
         else:
            curlstring = curlstring + '-O '
            msg = "\ntrying to download using cURL to %s.. \n" % savefile
         #command_text = 'cd download && { curl ' + curlstring + cfurl + ' ; cd -; }'
      else:
         msg = "\nfetching %s using cURL.. \n" % cfurl
      command_text = 'curl ' + curlstring + '-s ' + cfurl
      print(msg)
      print("\nsubmitting cURL command string: \n%s \n" % command_text)
      output = Popen(command_text, shell=True, stdout=PIPE, stderr=PIPE, stdin=PIPE)
      result, errors = output.communicate()
      if result is not None:
         if writeout == 1 and not re.search(r'(\.(htm)l?|\.php|\.txt|\.xml|\.[aj](sp)x?|\.cfm|\.do|\.md|\.json)$',outfile):
            print('\nsaved file: %s \n' % outfile)
         else:
            ht = BeautifulSoup(r.content, "html.parser")
            htpr = ht.prettify(formatter=None)
            htpr = u''.join(htpr).encode('utf-8').strip()
            print(htpr)
      else:
         if errors:
            print("\nerror: %s\n" % str(errors))
      finished.append(cfurl)

   elif usecurl == 0 and writeout == 1:
      getkb = lambda a: round(float(float(a)/1024),2)
      getmb = lambda b: round(float(float(b)/1048576),2)
      print("\ngetting %s... \n" % cfurl)
      if os.path.exists(savefile): # FOUND SAVED FILE
         # GET SIZE OF EXISTING LOCAL FILE
         resumesize = os.path.getsize(savefile)
         ksize = getkb(resumesize)
         msize = getmb(resumesize)
         sizeqt = 'kb'
         fsize = ksize
         if msize > 1:
            sizeqt = 'mb'
            fsize = msize
         existsize = str(fsize) + ' ' + sizeqt
         print("\n%s already exists! \n" % outfile)
         print("\nlocal file size: %s \n" % existsize)
         if 'existing' not in globals():
            existing = 0
         if existing == 0:
            checkresume = raw_input('choose an option [1-3]: 1) resume download, 2) start new download, 3) skip. --> ')
            while not re.match(r'^[1-3]$', checkresume):
               checkresume = raw_input('invalid input. enter 1 to resume, 2 to start new, or 3 to skip --> ')
            checkexist = raw_input('\ndo this for all downloads? Y/N --> ')
            while not re.match(r'^[YyNn]$', checkexist):
               checkexist = raw_input('invalid entry. enter Y to use same action on existing files or N to always ask --> ')
            if checkexist.lower() == 'y':
               existing = 1
            else:
               existing = 0

         if checkresume == '1': # RESUME DOWNLOAD AT LAST LOCAL BYTE
            dld = int(resumesize)
            resumeheader = {'Range': 'bytes=%s-' % str(dld)}
            dlmsg = "\nattempting to resume download for %s. this may take awhile depending on file size... \n" % outfile
            df = open(savefile, 'a+b')
         elif checkresume == '2': # DISREGARD SAVED FILE, START DOWNLOAD FROM TOP
            resumeheader = None
            dlmsg = "\nwriting content to \'download\' directory as file %s. this may take awhile depending on file size... \n" % outfile
            dld = 0
            df = open(savefile, 'wb+')
         else: # SKIPPING DOWNLOAD
            resumeheader = None
            df = open(savefile, 'r+')
            dlmsg = "\nskipping download for %s\n" % outfile

      else: # NEW DOWNLOAD REQUEST
         checkresume = '2'
         dld = 0
         df = open(savefile, 'wb+')
         resumeheader = None
         dlmsg = "\nwriting content to \'download\' directory as file %s. this may take awhile depending on file size... \n" % outfile

      print(dlmsg)

      if not checkresume == '3': # IF NOT SKIPPING
         r = scraper.get(cfurl, stream=True, headers=resumeheader, verify=False, allow_redirects=True, proxies=proxystring)
         filesize = r.headers.get('Content-Length')
         if checkresume == '1' and filesize is not None:
            filesize = int(filesize) + int(resumesize)
         filetype = r.headers.get('Content-Type')
         start = time.clock()
         #today = datetime.now()
         #startdate = date.strftime(today,"%m-%d-%Y %H:%M:%S ")
         #print("start time: %s \n" % startdate)
         with df as dlfile:
            if filesize is not None and 'text' not in filetype:
               bytesize = int(filesize)
               kbsize = getkb(bytesize)
               mbsize = getmb(bytesize)
               qt = 'bytes'
               size = bytesize
               if kbsize > 10:
                  qt = 'kb'
                  size = kbsize
                  if mbsize > 1 :
                     qt = 'mb'
                     size = mbsize
               print('\nfile size: %d %s \n' % (size, qt))
               for chunk in r.iter_content(chunk_size=4096):
                  if chunk:
                     dld += len(chunk)
                     dlfile.write(chunk)
                     done = int((50 * int(dld)) / int(filesize))
                     dldkb = getkb(dld)
                     dldmb = getmb(dld)
                     unit = 'b'
                     prog = str(round(dld,2))
                     if dldkb > 1:
                        unit = 'kb   '
                        prog = str(round(dldkb,2))
                        if dldmb > 1:
                           unit = 'mb   '
                           prog = str(round(dldmb,2))
                     sys.stdout.write("\rdownloaded: %s %s   [%s%s] %d kb/s\r" % (prog, unit, '#' * done, ' ' * (50 - done), 0.128 * (dldkb / (time.clock() - start))))
                     dlfile.flush()
                     os.fsync(dlfile.fileno())
                  else:
                     break
            elif filesize and 'text' in filetype:
               dlfile.write(r.content)
               dlfile.flush()
               os.fsync(dlfile.fileno())
            else:
               for chunk in r.iter_content(chunk_size=1024):
                  if chunk:
                     dld += len(chunk)
                     dlfile.write(chunk)
                     dlfile.flush()
                     os.fsync(dlfile.fileno())
                  else:
                     break
         print("\r\nfile %s saved! \n" % outfile)
         endclock = time.clock()
         fin = endclock - start
         totalsecs = fin
         if debug == 1:
            print("\n\033[34;1mSTART: \033[35;1m %s \033[0;21m\n" % str(start))
            print("\n\033[34;1mEND: \033[35;1m %s \033[0;21m\n" % str(endclock))
         elapsed = "%s seconds " % str(totalsecs)
         if totalsecs > 60:
            totalmins = float(totalsecs / 60)
            mins = int(totalmins)
            if mins == 1:
               unitmin = "minute"
            else:
               unitmin = "minutes"
            strmin = str(mins) + " " + str(unitmin)
            secs = round((totalsecs % 60), 4)
            elapsed = str(strmin) + " " + str(secs)
            if totalmins > 60:
               totalhours = float(totalmins / 60 )
               hours = int(totalmins / 60)
               if hours == 1:
                  unithr = "hour"
               else:
                  unithr = "hours"
               strhr = str(hours) + " " + str(unithr)
               mins = round((totalmins % 60),3)
               elapsed = "%s, %s mins, %s secs" % (strhr, mins, secs)
            else:
               hours = 0
         else:
            hours = 0
            mins = 0
            secs = round(totalsecs,3)
            elapsed = "%s seconds" % str(secs)
         #ended = datetime.now()
         #enddate = date.strftime(ended,"%m-%d-%Y %H:%M:%S ")
         #print("end time: %s \n" % enddate)
         print("\ndownload time elapsed: %s \n" % str(elapsed))
         time.sleep(4)
         print('\r\n--------------------------------------------------------\r\n')

      else:
         print("\nskipped download from %s.\r\nfile has not been modified.\n" % cfurl)
      
      getpage(cfurl)
      finished.append(cfurl)
      
   else:
      getpage(cfurl)
      finished.append(cfurl)

   def getlinks(cfurl):
      r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      html = BeautifulSoup(r.text, "html.parser")
      if debug == 1:
         orenc = str(html.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      bs = html.prettify(formatter=None)
      linkresult = html.findAll('a')
      if len(linkresult) > 0:
         foundlinks = len(linkresult)
         print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
         for link in linkresult:
            b = link.get('href')
            b = str(b)
            if b not in cfurl and not re.match(r'^(\.\.)?\/$', b):
               print(b)
         print('')
      else:
         print('\nNO LINKS FOUND.\n')
         foundlinks = 0
      time.sleep(4)
      return foundlinks

   def selectdir(geturl):
      r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      html = BeautifulSoup(r.text, "html.parser")
      if debug == 1:
         orenc = str(html.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      findlinks = html.findAll('a')
      dirlist = []
      for link in findlinks:
         b = link.get('href')
         if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
               dirlist.append(b)

      p = urlparse(geturl)
      part = p.path.split('/')[-1]
      path = p.path.rstrip(part)
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path

      i = 0
      dirtotal = len(dirlist)
      if dirtotal > 0:
         print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
         while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
         print('')
         lim = dirtotal + 1
         matchtop = r'^(%s)(\/)?$' % urlfqdn
         if not re.match(matchtop,geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
         else:
            startsel = '1-%d' % dirtotal
         selectdir = raw_input('make a selection [%s] --> ' % startsel)
         if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
         if selectdir == '0':
            geturl = parent
            subcont = 0
         else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
      else:
         print('\nNO DIRECTORIES FOUND. using current directory.. \n')
         subcont = 0
         geturl = parent + part
      return geturl, subcont, parent
      
   def getparent(cfurl):
      cff = re.match(r'^http:\/\/(.*)(\/\/)(.*)', cfurl)
      if cff:
         cf = 'http://' + str(cff.group(1)) + '/' + str(cff.group(3))
      else:
         cf = str(cfurl)
      p = urlparse(cf)
      if '/' not in p.path[-1:]:
         part = p.path.split('/')[-1]
         path = p.path.rstrip(part)
      else:
         path = p.path
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path + '/'
      return parent

   def followlinks(bx):
      p = urlparse(bx)
      if '/' not in p.path[-1:]:
         part = p.path.split('/')[-1]
         path = p.path.rstrip(part)
      else:
         path = p.path
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path + '/'
      s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      print('\n----------------------------------------------------------- \n')
      print(s)
      print('\n')
      shtml = BeautifulSoup(s.text, "html.parser")
      if debug == 1:
         orenc = str(shtml.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      print('\n----------------------------------------------------------- \n')
      sfindlinks = shtml.findAll('a')
      slen = len(sfindlinks)
      sdirs = []
      si = 0
      while si < slen:
         for slink in sfindlinks:
            if debug == 1:
               print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
               if not re.search(r'^((\.\.)?\/)$', str(sl)):
                  if '/' in bx[-1:]:
                     if 'http' not in sl[:4]:
                        sl = sl.lstrip('/')
                        sx = bx + sl
                     else:
                        sx = sl
                     print(sx)
                     getCF(sx, 0)
                     ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                     bb = BeautifulSoup(ss.text, "html.parser")
                     if bb is not None:
                        if debug == 1:
                           orenc = str(bb.original_encoding)
                           print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                        if bb.html is not None:
                           pagehead = bb.html.head.contents
                           if pagehead is not None and len(pagehead) > 1:
                              pagehead = u''.join(pagehead).encode('utf-8').strip()
                              pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                              pagetitle = str(pagetitle.group(1))
                              bigtitle = pagetitle.upper()
                              titlestars = lambda a: '*' * (len(str(a)) + 4)
                              pagestars = titlestars(pagetitle)
                              print('\n\033[40m\033[33m%s\033[0m\n\033[34;1m* %s *\033[0m \n\033[40m\033[33;21m%s\033[0m\n' % (pagestars, bigtitle, pagestars))
                              
                        sb = bb.find_all('a', href = re.compile(r'.+$'))
                        sblen = len(sb)
                        if sblen > 0:
                           n = 0
                           while n < sblen:
                              for sbl in sb:
                                 if debug == 1:
                                    print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                                 if sbl is not None:
                                    sr = sbl.get('href').strip()
                                    sr = str(sr)
                                    print('\n* %s \n') % sr
                                    if not re.search('http', sr[:4]):
                                       parent = getparent(sx)
                                       srs = sr.lstrip('/')
                                       sr = parent + srs
                                    if re.match(r'([^.]+\/)$', str(sr)):
                                       followlinks(sr)
                                       sdirs.append(sr)
                                    else:
                                       if '/' not in sr[-1:]:
                                          getCF(sr, 0)
                                          sdirs.append(sr)
                                    n += 1
                                 else:
                                    n += 1
                                    continue
                     else:
                        n += 1
                        continue
                        
                  elif 'Error-222' in bx:
                     print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                     for i in xrange(10,0,-1):
                        time.sleep(1)        
                        print('delaying request for %d seconds.. \r' % i)
                        sys.stdout.flush()
                     break
                  else:
                     if not re.search('http', str(sl[:4])):
                        parent = getparent(bx)
                        if '/' in sl[:1]:
                           sl = sl.lstrip('/')
                        sx = parent + sl
                     else:
                        sx = str(sl)

                  sx = str(sx)
                  sdirs.append(sx)
                  print(sx)
                  print('\n----------------------------------------------------------- \n')              
                  getCF(sx, 0)
               si += 1

               #if re.search(r'^(.*)(\/)$', str(bx)):
            else:
               print('\nno links found at %s \n' % str(slink))
               si += 1
               continue

      for sd in sdirs:
         if '/' in sd[-1:]:
            print('\nfollowing directory: %s \n' % sd)
            followlinks(sd)
            getCF(sd, 1)
         else:
            print('\nrequesting link: %s \n' % sd)
            getCF(sd, 0)
      return sdirs

   if links == 1:
      if 'found' not in locals():
         found = getlinks(cfurl)
         keep = 1
         depth = 0
      while found > 0 and keep is not 0:
         follow = raw_input('fetch harvested links? enter Y/N --> ')
         while not re.search(r'^[yYnN]$', follow):
            follow = raw_input('invalid entry. enter Y to follow harvested links or N to quit --> ')
         if follow.lower() == 'n':
            break
         elif follow.lower() == 'y':
            r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
            html = BeautifulSoup(r.text, "html.parser", from_encoding='utf-8')
            findlinks = html.findAll('a')
            s = []
            checkfordirs = 0
            if len(findlinks) > 0:
               for d in findlinks:
                  dd = d.get('href')
                  if re.search(r'^(.*)(\/)$', str(dd)):
                     if not re.match(r'^((\.\.)?\/)$', str(dd)) and dd not in cfurl:
                        if 'http' not in dd[:4]:
                           dd = parent + dd
                        s.append(str(dd))
                        checkfordirs = 1

            if len(s) > 0 and checkfordirs == 1:
               if 'followdirs' not in locals():
                  followdirs = raw_input('follow directories? enter Y/N --> ')
                  while not re.search(r'^[yYnN]$', followdirs):
                     followdirs = raw_input('invalid entry. enter Y to follow directories or N to only retrieve files --> ')
                  if followdirs.lower() == 'y':
                     depth = 1
                  else:
                     depth = 0
               else:
                  if followdirs.lower() == 'y':
                     depth += 1
            else:
               followdirs = 'n'

            if debug == 1:
               print("\n\033[35;1mdepth:\033[37;21m %d \033[0m\n" % depth)
            if findlinks:
               total = len(findlinks)
            else:
               total = 0
            if writeout == 1:
               if not os.path.exists(parentdir):
                  os.makedirs(parentdir)
            if total > 0:
               if followdirs.lower() == 'n':
                  for link in findlinks:
                     b = link.get('href')
                     if b:
                        if not re.search(r'^(.*)(\/)$', str(b)):
                           b = parent + b
                           print("\nrequesting harvested URL: %s \r\n(press CTRL + C to skip)\n" % b)
                           try:
                              getCF(b, links)
                           except KeyboardInterrupt:
                              try:
                                 print("\r\nskipping %s... press CTRL + C again to quit.\n" % b)
                                 continue
                              except KeyboardInterrupt:
                                 print("\nrequest cancelled.\n")
                                 break
                           except (KeyboardInterrupt, SystemExit):
                              print("\r\nrequest cancelled by user\n")
                              keep = 0
                              break
                           except Exception, e:
                              print("\r\nan exception has occurred: %s \n" % str(e))
                              raise
                        else:
                           continue
                     else:
                        break
                     total = total - 1
                  links = 1
               elif followdirs.lower() == 'y' and depth > 0:
                  choosedir = raw_input("choose subdirectory? Y/N --> ")
                  while not re.match(r'^[YyNn]$', choosedir):
                     choosedir = raw_input("invalid entry. enter Y to pick subdirectory or N to download everything --> ")
                  if choosedir.lower() == 'n':
                     links = 0
                     for link in findlinks:
                        b = link.get('href')
                        if b:
                           bx = parent + b

                           if not re.match(r'^((\.\.)?\/)$', str(b)):
                              getdirs = followlinks(bx)
                              while len(getdirs) > 0:
                                 for sd in getdirs:
                                    getdirs = followlinks(sd)
                              print("\nrequesting harvested URL: %s \r\n(press CTRL + C to skip)\n" % bx)
                              try:
                                 getCF(bx, links)
                                 if debug == 1:
                                    print("\nfound: %d \n" % found)
                              except KeyboardInterrupt:
                                 try:
                                    print("\r\nskipping %s... press CTRL + C again to quit.\n" % bx)
                                    continue
                                 except KeyboardInterrupt:
                                    print("\nrequest cancelled.\n")
                                    sys.exit()
                              except (KeyboardInterrupt, SystemExit):
                                 print("\r\nrequest cancelled by user\n")
                                 break
                              except Exception, e:
                                 print("\r\nan exception has occurred: %s \n" % str(e))
                                 raise
                                 sys.exit(1)
                     links = 1
                     found = found - 1
                  else:
                     subcont = 1
                     geturl = cfurl
                     while subcont is not 0:
                        depth += 1
                        if subcont < 1:
                           break
                        geturl, subcont, parent = selectdir(geturl)
                        if debug == 1:
                           print("\ndepth: %d \n" % depth)
                        checksubdir = raw_input("enter 1 to select this directory, 2 to choose a subdirectory, or 3 to go back to parent directory --> ")
                        while not re.match(r'^[1-3]$', checksubdir):
                           checksubdir = raw_input("invalid input. enter a value 1-3 --> ")
                        if checksubdir is not 2:
                           if checksubdir == '3':
                              p = urlparse(geturl)
                              droppath = p.path.split('/')[-1]
                              geturl = geturl.rstrip(droppath)
                           break

                     print('\nrequesting harvested URL: %s \r\n(press CTRL + C to skip) \n' % geturl)
                     try:
                        getCF(geturl, links)
                        found = found - 1
                     except KeyboardInterrupt:
                        try:
                           print("\r\nskipping %s... press CTRL + C again to quit.\n" % geturl)
                           continue
                        except KeyboardInterrupt:
                           print("\nrequest cancelled.\n")
                           break
                     except (KeyboardInterrupt, SystemExit):
                        print("\r\nrequest cancelled by user\n")
                        keep = 0
                        break
                     except Exception, e:
                        print("\r\nan exception has occurred: %s \n" % str(e))
                        raise
                        sys.exit(1)
                     finally:
                        depth -= 1
                        if debug == 1:
                           print("\ndepth: %d \n" % depth)

               elif followdirs.lower() == 'y' and depth < 1:
                  for link in findlinks:
                     b = link.get('href')
                     if not re.  match(r'^((\.\.)?\/)$', str(b)):
                        bx = parent + b
                        print("\nrequesting harvested URL: %s \r\n(press CTRL + C to skip)\n" % bx)
                        try:
                           getCF(bx, links)
                        except KeyboardInterrupt:
                           try:
                              print("\r\nskipping %s... press CTRL + C again to quit.\n" % bx)
                              continue
                           except KeyboardInterrupt:
                              print("\nrequest cancelled.\n")
                              break
                        except (KeyboardInterrupt, SystemExit):
                           print("\r\nrequest cancelled by user\n")
                           break
                        except Exception, e:
                           print("\r\nan exception has occurred: %s \n" % str(e))
                           raise
                           sys.exit(1)
                        finally:
                           links = 0
                     else:
                        continue
                     found = found - 1
                     if debug == 1:
                        print("\nfound: %d \n" % found)

               else:
                  for link in findlinks:
                     b = link.get('href')
                     links = 0
                     if debug == 1:
                        print("\nfound: %d \n" % found)
                     while b:
                        if not re.search(r'^(.*)(\/)$', str(b)):
                           b = parent + b
                           print("\nrequesting harvested URL: %s \r\n(press CTRL + C to skip)\n" % b)
                           try:
                              getCF(b, links)
                           except KeyboardInterrupt:
                              print("\r\nskipping %s...\n" % b)
                              continue
                           except (KeyboardInterrupt, SystemExit):
                              print("\r\nrequest cancelled by user\n")
                              break
                           except Exception, e:
                              print("\r\nan exception has occurred: %s \n" % str(e))
                              raise
                        else:
                           continue
                     if debug == 1:
                        found = found - 1
                     print("\nfound: %d \n" % found)
                  links = 1
            else:
               print("\ndid not find any links\n")
               found = found - 1
               if debug == 1:
                  print("\nfound: %d \n" % found)
               keep = 0
               break

         else:
            cpath = p.path.strip('/')
            cpaths = cpath.split('/')
            lastpath = cpaths[-1]
            if len(lastpath) < 1 and len(cpaths) > 1:
               lastpath = cpaths[-2]
            cfurl = cfurl.strip('/')
            matchtop = r'^(%s)$' % urlfqdn
            if found == 0:
               keep = 0
               print("\nfinished following all links.\n")
            break

      else:
         found = 0

         print("\nno more links to fetch at %s.\n" % cfurl)
         if debug == 1:
            print("\nfound: %d \n" % found)
         cpath = p.path.strip('/')
         cpaths = cpath.split('/')
         lastpath = cpaths[-1]
         if len(lastpath) < 1 and len(cpaths) > 1:
            lastpath = cpaths[-2]
         cfurl = cfurl.strip('/')
         urlfqdn = urlfqdn.strip('/')
         print(urlfqdn)
         matchtop = r'^(%s)$' % urlfqdn
         if re.match(matchtop, cfurl) and found == 0:
            keep = 0
            print("\nfinished following all links.\n")
         else:
            cfurl = cfurl.rstrip(lastpath)
            print('\ntrying %s.. \n' % cfurl)

try:
   getCF(cfurl, links)

except (KeyboardInterrupt, SystemExit):
   print("\r\nrequest cancelled by user\n")
   print("\r\nhit CTRL + C again to exit program, or it will automatically continue in 10 seconds.\n")
   try:
      time.sleep(10)
      getCF(cfurl, links)
   except KeyboardInterrupt:
      sys.exit("\nrequest cancelled by user.\n")
   except Exception, exc:
      print("\nan error has occurred: %s \n" % str(exc))
      sys.exit("unable to continue. check the URL and try again.\n")

except requests.exceptions.ConnectionError, e:
   print("\na connection error occurred: %s \n" % str(e))
   pass
   time.sleep(7)
   print("\nattempting to reconnect to %s...\n" % cfurl)
   try:
      getCF(cfurl, links)
   except Exception, exc:
      print("\nan exception has occurred %s \n" % str(exc))
      raise

except RuntimeError, e:
   print("\na runtime error has occurred: %s \n" % str(e))
   raise

except SyntaxError, e:
   print("\na typo is a silly reason to force a program to terminate..\n")
   print("\nespecially this one:\n %s \n" % str(e))
   raise

except IOError, e:
   print("\na connection error has occurred: %s \n" % str(e))
   pass
   time.sleep(7)
   print("\nattempting to reconnect to %s...\n" % cfurl)
   try:
      getCF(cfurl, links)
   except Exception, exc:
      print("\nan exception has occurred %s \n" % str(exc))
      print("unable to continue. please restart the program.\n")
      raise
      exit(1)

except Exception, e:
   print("\nan error has occurred: %s \n" % str(e))
   print("unable to continue. check the parameters and try again.\n")
   raise
   if debug == 1:
      traceback_template = '''Traceback (most recent call last):
      File "%(filename)s", line %(lineno)s, in %(name)s
   %(type)s: %(message)s\n'''
      traceback_details = {
                            'filename': sys.exc_info()[2].tb_frame.f_code.co_filename,
                            'lineno'  : sys.exc_info()[2].tb_lineno,
                            'name'    : sys.exc_info()[2].tb_frame.f_code.co_name,
                            'type'    : sys.exc_info()[0].__name__,
                            'message' : sys.exc_info()[1].message,
                           }
      print
      print(traceback.format_exc())
      #print(traceback.extract_tb(sys.exc_info()[2]))
      print(traceback_template % traceback_details)
   sys.exit(1)

print("\nexiting..\n")
sys.exit(0)

Source : https://github.com/eudemonics


Viewing all articles
Browse latest Browse all 443

Trending Articles