#!/usr/bin/env python3
"""
Script to parse a sitemap.xml file, then look through an NGINX log file for the
number of hits for each of the URLs defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from collections import defaultdict
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(
        description='Count the daily number of loads of each page, by unique IP, from an nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """
    Parses the arguments, the crawler file and the sitemap,
    then reads the log file line by line, using regexes to isolate locations and client IPs.
    It records the number of unique IPs accessing each known page (from the sitemap),
    and the number of unique IPs accessing each unknown location
    (either resources being loaded or bots looking for vulnerable websites).
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once and for all for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]

    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns_match = re.match(r'{.*}', root.tag)
    ns = ns_match.group(0) if ns_match else ''
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)

    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
    known_page_regex = re.compile(
        log_line_template.format(locations='|'.join(map(re.escape, locations))))
    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))

    visit_dict = {location: set() for location in locations}
    bot_visit_dict = {location: set() for location in locations}
    other_visits = defaultdict(set)

    with open(args.logfile, 'r') as logfile:
        for line in logfile:
            match_obj = known_page_regex.match(line)
            if match_obj:
                client_ip = match_obj.group("ip_address")
                location = match_obj.group("location")
                if not any(map(re.search, crawler_patterns, repeat(line))):
                    visit_dict[location].add(client_ip)
                else:
                    bot_visit_dict[location].add(client_ip)
            else:
                match_obj = other_pages_regex.match(line)
                if match_obj:
                    client_ip = match_obj.group("ip_address")
                    location = match_obj.group("location")
                    # Group resource requests by prefix to keep the report readable
                    if location.startswith("/isso/"):
                        other_visits["/isso/*"].add(client_ip)
                    elif location.startswith("/assets/css/"):
                        other_visits["/assets/css/*"].add(client_ip)
                    elif location.startswith("/assets/js/"):
                        other_visits["/assets/js/*"].add(client_ip)
                    elif location.startswith("/images/"):
                        other_visits["/images/*"].add(client_ip)
                    else:
                        other_visits[location.split('?')[0]].add(client_ip)

    total_visits = 0
    print("Standard visits:")
    for loc, ips in visit_dict.items():
        print(f"{loc}: {len(ips)}")
        total_visits += len(ips)
    print(f'Total visits: {total_visits}')

    if args.exclude_crawler:
        print("Bot visits:")
        for loc, ips in bot_visit_dict.items():
            print(f"{loc}: {len(ips)}")

    nb_other_visits = 0
    print("Other visits:")
    for loc, ips in other_visits.items():
        print(f"{loc}: {len(ips)}")
        nb_other_visits += len(ips)
    print(f'Total visits: {total_visits}')
    print(f'Other visits: {nb_other_visits}')

    # for path in locations:
    #     # Pre-process log file using grep, to keep only interesting lines
    #     cmd = ["grep", "-e", f'GET {path} ', args.logfile]
    #     process = run(cmd, capture_output=True, text=True)
    #     # Simultaneously keep only unique source IPs and exclude crawlers if requested
    #     lines = {line.split(' ')[0] for line in process.stdout.splitlines()
    #              if not any(map(re.search, crawler_patterns, repeat(line)))}
    #     print(f"{path}: {len(lines)}")


if __name__ == "__main__":
    main()