CloudScraper.py
from argparse import ArgumentParser
from multiprocessing import Pool
from termcolor import colored
from rfc3987 import parse
import itertools
import requests
import urllib3
import sys
import re


def print_banner():
    print('''\nCloudScraper is a tool to search through the source code of websites in order to find cloud resources belonging to a target.
by Jordan Potti
@ok_bye_now\n''')
def checker(url):
    '''
    Check whether the url is valid.
    '''
    try:
        parse(url)
        return True
    except ValueError:
        return False
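# Illustrative examples (assumed behaviour of rfc3987.parse, not part of the
# original script): checker('https://example.com/path') returns True, while a
# string that cannot be parsed as an IRI, e.g. 'http://exa mple.com', returns False.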
def gather_links(html):
    '''
    Apply a regular expression to the raw HTML to gather all the urls it contains.
    '''
    urls = []
    links_ = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html)
    urls.extend(filter(checker, links_))  # keep only the urls that the checker function accepts
    return list(set(urls))
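# Illustrative example (hypothetical input, not part of the original script):
#   gather_links('<a href="https://mybucket.s3.amazonaws.com/dump.zip">x</a>')
# would return ['https://mybucket.s3.amazonaws.com/dump.zip'].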
def start(target):
    '''
    Load the initial url and gather the first urls that will be used
    by the spider to keep looking for more links.
    '''
    print(colored("Beginning search for cloud resources in {}".format(target), color='cyan'))
    try:
        html = requests.get(target, allow_redirects=True, headers=headers, verify=arguments.no_verify).text
        links = gather_links(html)
    except requests.exceptions.RequestException as e:
        if arguments.v:
            print(colored('Network error: {}'.format(e), 'red', attrs=['bold']))
        return
    print(colored('Initial links: {}\n'.format(len(links)), color='cyan'))
    spider(links, target)
def worker(url):
    '''
    Function handling the crawling action of the spider.
    It first checks the desired crawl depth (the domain filtering is done in
    spider()), then makes a GET request, parses the HTML and returns all the
    links it finds.
    '''
    if url.count("/") <= arguments.depth + 2:
        try:
            html = requests.get(url, allow_redirects=True, headers=headers, verify=arguments.no_verify).text
            links = gather_links(html)
        except requests.exceptions.RequestException as e:
            if arguments.v:
                print(colored('Network error: {}'.format(e), 'red', attrs=['bold']))
            return []
        print('{} links found [{}]'.format(len(links), url))
        return links
    else:
        return []
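# Note on the depth check above (added for clarity): url.count("/") also counts
# the two slashes in the scheme, so with the default -d 5 a url such as
# https://example.com/a/b/c/d/e (7 slashes) is still fetched, while anything
# nested one level deeper is skipped.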
def spider(base_urls, target):
    '''
    Loop through the initial links found in the given page. Each new link
    discovered is added to the list if it is not already there, and is then
    crawled as well in search of more links.

    wannabe holds the urls that are yet to be crawled.
    base_urls holds all the urls that have already been crawled.
    '''
    global target_
    target_ = parse(target)
    p = Pool(arguments.process)
    wannabe = [url for url in base_urls if target_['authority'] in parse(url)['authority']]

    while True:
        # retrieve all the urls returned by the workers
        new_urls = p.map(worker, wannabe)
        # flatten them and remove duplicates
        new_urls = list(set(itertools.chain(*new_urls)))
        wannabe = []
        i = 0

        # if new_urls is empty, no more urls are being discovered, so exit the loop
        if new_urls == []:
            break
        else:
            for url in new_urls:
                if url not in base_urls:
                    # For each new url, check that it has not been crawled yet. If it
                    # is indeed new and contains the target domain, it gets appended to
                    # the wannabe list so it will be crawled in the next iteration.
                    i += 1
                    if target_['authority'] in parse(url)['authority']:
                        wannabe.append(url)
                    base_urls.append(url)
            print(colored('\nNew urls appended: {}\n'.format(i), 'green', attrs=['bold']))

    p.close()
    p.join()

    # once all the links for the given depth have been analyzed, run the parser
    parser(base_urls)
def parser(links):
    '''
    Once all the links have been gathered, check how many of them
    match the list of cloud domains we are interested in.
    '''
    print(colored('Parsing results...', 'cyan', attrs=['bold']))
    cloud_domains = ['amazonaws.com', 'digitaloceanspaces.com', 'windows.net', 'storage.googleapis.com', 'aliyuncs.com']
    matches = []
    for cloud_domain in cloud_domains:
        matches.extend(link for link in links if cloud_domain in link)
    matches = list(set(matches))

    print('\nTotal links: ', len(links))
    if len(matches) == 0:
        print(colored("There were no matches!", 'red', attrs=['bold']))
    else:
        print(colored("There were {} matches for this search!".format(len(matches)), 'green', attrs=['bold']))
        for match in matches:
            print(match, "\n")
def args():
    parser = ArgumentParser()
    parser.add_argument("-u", dest="URL", required=False, help="Target Scope")
    parser.add_argument("-d", dest="depth", type=int, required=False, default=5, help="Max Depth of links Default: 5")
    parser.add_argument("-l", dest="targetlist", required=False, help="Location of text file of Line Delimited targets")
    parser.add_argument("-v", action="store_true", default=False, required=False, help="Verbose output")
    parser.add_argument("-p", dest="process", required=False, default=2, type=int, help="Number of processes to run")
    parser.add_argument("--no-verify", action="store_false", default=True, required=False, help="Skip TLS verification")

    if len(sys.argv) == 1:
        # parser.error() prints the usage message and exits on its own
        parser.error("No arguments given.")

    # output parsed arguments into a usable object
    return parser.parse_args()
def cleaner(url):
    url = url.strip()
    if not url.startswith('http'):
        return "https://" + url
    return url
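# For example (illustrative, not part of the original script): cleaner('example.com\n')
# returns 'https://example.com', while a target that already has a scheme is only
# stripped of surrounding whitespace.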
def main():
    if arguments.targetlist:
        with open(arguments.targetlist, 'r') as target_list:
            for line in target_list:
                if line.strip():  # skip blank lines
                    start(cleaner(line))
    else:
        start(cleaner(arguments.URL))


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}

arguments = args()

# If --no-verify was passed, arguments.no_verify is False (store_false) and we
# likely don't care about insecure request warnings, so suppress them.
if not arguments.no_verify:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
if __name__ == '__main__':
    print_banner()
    main()
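# Example invocations (assumed usage based on the flags defined in args() above;
# requires the third-party packages requests, termcolor, rfc3987 and urllib3):
#   python CloudScraper.py -u https://example.com -d 3 -p 4 -v
#   python CloudScraper.py -l targets.txt --no-verify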