#!/usr/bin/env python3
"""
Script to parse a sitemap.xml file, then look through an NGINX log file for the
number of hits for each of the URLs defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(
        description='Count the number of daily loads of each page in the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """
    Parses the arguments, the crawler file and the sitemap, then for each location,
    uses grep to select the lines containing GET requests for that location, and
    prints the number of unique IPs accessing it.
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open or parse the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once up front for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]

    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)

    for path in locations:
        # Pre-process the log file using grep, to keep only the interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines()
                 if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")


if __name__ == "__main__":
    main()
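
# Minimal usage sketch. The script filename "sitemap_hits.py" below is an
# illustrative assumption, not part of the original; the paths are simply the
# script's own defaults:
#
#   ./sitemap_hits.py -s /var/www/my_webapp/www/sitemap.xml \
#       -l /var/log/nginx/saxodwarf.fr-access.log.1 \
#       --exclude-crawler
#
# Expected output is one "path: count" line per <loc> entry in the sitemap,
# where count is the number of distinct client IPs. Note that extracting the
# IP with line.split(' ')[0] assumes the client address is the first
# whitespace-separated field of each log line, as in nginx's default
# "combined" log format ($remote_addr - $remote_user [$time_local] ...).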