pages_stats/get_page_stats.py

#!/usr/bin/env python3
""" Script to parse a sitemap.xml file,
then look through a NGINX log file for the number of hits for each of the URLs
defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from collections import defaultdict
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(
        description='Count the number of daily loads of each page in the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                        help="Path to the sitemap XML file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()
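
# Example invocation (a sketch; the paths are simply the defaults defined above):
#   ./get_page_stats.py --sitemap /var/www/html/sitemap.xml \
#       --logfile /var/log/nginx/saxodwarf.fr-access.log.1 --exclude-crawler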


def main():
    """ Parses the arguments, the crawler file and the sitemap,
    then reads the log file line by line, using regexes to isolate the requested location and the client IP.
    It records the number of unique IPs accessing each known page (from the sitemap), and
    the number of unique IPs accessing each unknown location
    (either resources being loaded, or bots looking for vulnerable websites).
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
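    # The crawler file is assumed to follow the shape of the public
    # crawler-user-agents dataset: a JSON array of objects whose "pattern"
    # field holds a regex fragment matching a bot's User-Agent string.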
    # Crawler patterns are compiled once and for all, for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]
    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)
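    # The regexes below assume something like nginx's default "combined" log format,
    # e.g. (illustrative line, not taken from a real log):
    # 203.0.113.7 - - [10/Oct/2024:13:55:36 +0200] "GET /blog/post.html HTTP/1.1" 200 5123 "-" "Mozilla/5.0 ..."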
    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
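    # Each dict maps a location to the set of unique client IPs that requested it.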
    visit_dict = {loc: set() for loc in locations}
    bot_visit_dict = {loc: set() for loc in locations}
    other_visits = defaultdict(set)
    with open(args.logfile, 'r') as logfile:
        for line in logfile:
            match_obj = re.match(known_page_regex, line)
            if match_obj:
                client_ip = match_obj.group("ip_address")
                location = match_obj.group("location")
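                # map() applies each compiled crawler pattern to this same line
                # (hence repeat(line)); any() short-circuits on the first match.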
                if not any(map(re.search, crawler_patterns, repeat(line))):
                    visit_dict[location].add(client_ip)
                else:
                    bot_visit_dict[location].add(client_ip)
            else:
                match_obj = re.match(other_pages_regex, line)
                if match_obj:
                    client_ip = match_obj.group("ip_address")
                    location = match_obj.group("location")
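                    # Group requests for static assets and the comment system
                    # (isso, assumed from the path) under wildcard buckets;
                    # otherwise keep the path with its query string stripped.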
                    if location.startswith("/isso/"):
                        other_visits["/isso/*"].add(client_ip)
                    elif location.startswith("/assets/css/"):
                        other_visits["/assets/css/*"].add(client_ip)
                    elif location.startswith("/assets/js/"):
                        other_visits["/assets/js/*"].add(client_ip)
                    elif location.startswith("/images/"):
                        other_visits["/images/*"].add(client_ip)
                    else:
                        other_visits[location.split('?')[0]].add(client_ip)
    total_visits = 0
    print("Standard visits:")
    for loc, ips in visit_dict.items():
        print(f"{loc}: {len(ips)}")
        total_visits += len(ips)
    print(f'Total visits: {total_visits}')
    if args.exclude_crawler:
        print("Bot visits:")
        for loc, ips in bot_visit_dict.items():
            print(f"{loc}: {len(ips)}")
    nb_other_visits = 0
    print("Other visits:")
    for loc, ips in other_visits.items():
        print(f"{loc}: {len(ips)}")
        nb_other_visits += len(ips)
    print(f'Total visits: {total_visits}')
    print(f'Other visits: {nb_other_visits}')
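
    # Alternative, grep-based version of the counting loop, left commented out: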
    # for path in locations:
    #     # Pre-process the log file using grep, to keep only interesting lines
    #     cmd = ["grep", "-e", f'GET {path} ', args.logfile]
    #     process = run(cmd, capture_output=True, text=True)
    #     # Simultaneously keep only unique source IPs and exclude crawlers if requested
    #     lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
    #
    #     print(f"{path}: {len(lines)}")


if __name__ == "__main__":
    main()