#!/usr/bin/env python3
"""
Script to parse a sitemap.xml file, then look through an NGINX log file for the
number of hits for each of the URLs defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(
        description='Count the number of daily loads of each page in the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """
    Parses the arguments, the crawler file and the sitemap, then for each location,
    uses grep to select the lines containing GET requests for that location, and
    prints the number of unique IPs accessing it.
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open or parse the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once up front for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]

    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)

    for path in locations:
        # Pre-process the log file using grep, to keep only the interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines()
                 if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")


if __name__ == "__main__":
    main()
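
# Minimal usage sketch. The script filename "sitemap_hits.py" below is an
# illustrative assumption, not part of the original; the paths are simply the
# script's own defaults:
#
#   ./sitemap_hits.py -s /var/www/my_webapp/www/sitemap.xml \
#       -l /var/log/nginx/saxodwarf.fr-access.log.1 \
#       --exclude-crawler
#
# Expected output is one "path: count" line per <loc> entry in the sitemap,
# where count is the number of distinct client IPs. Note that extracting the
# IP with line.split(' ')[0] assumes the client address is the first
# whitespace-separated field of each log line, as in nginx's default
# "combined" log format ($remote_addr - $remote_user [$time_local] ...).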