diff --git a/get_page_stats.py b/get_page_stats.py
new file mode 100644
index 0000000..6dcba4b
--- /dev/null
+++ b/get_page_stats.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+""" Script to parse a sitemap.xml file,
+then look through an NGINX log file for the number of hits for each of the URLs
+defined in the sitemap, counted by unique IP.
+"""
+import re
+import json
+import argparse
+from itertools import repeat
+from subprocess import run
+from urllib.parse import urlparse
+import xml.etree.ElementTree as ET
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Count the daily loads of each page in the nginx log file.')
+    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
+                        help="Path to the sitemap XML file for the website.")
+    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
+                        help="Path to the log file to analyze.")
+    parser.add_argument("-e", "--exclude-crawler", action="store_true",
+                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
+    return parser.parse_args()
+
+def main():
+    """ Parses the arguments, the crawler file and the sitemap,
+    then, for each location, uses grep to select the lines containing GET requests
+    for that location, and prints the number of unique IPs accessing it.
+    """
+    args = parse_args()
+
+    if args.exclude_crawler:
+        try:
+            with open("./crawler-user-agents.json", 'r') as crawler_file:
+                crawlers = json.load(crawler_file)
+        except (FileNotFoundError, json.JSONDecodeError):
+            print("Could not open or parse the crawler user agents file")
+            crawlers = []
+    else:
+        crawlers = []
+    # Crawler patterns are compiled once up front for speed
+    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]
+
+    locations = []
+    tree = ET.parse(args.sitemap)
+    root = tree.getroot()
+    # Get the default XML namespace, needed for tag lookup later
+    ns = re.match(r'{.*}', root.tag).group(0)
+    for url in root:
+        locations.append(urlparse(url.find(f"{ns}loc").text).path)
+
+    for path in locations:
+        # Pre-process the log file using grep, to keep only the interesting lines
+        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
+        process = run(cmd, capture_output=True, text=True)
+        # Simultaneously keep only unique source IPs and exclude crawlers if requested
+        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
+
+        print(f"{path}: {len(lines)}")
+
+if __name__ == "__main__":
+    main()
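
# A minimal, self-contained sketch of the per-path filtering step used above,
# shown in isolation. The log lines and crawler patterns here are made up for
# illustration only: keep the source IP of each matching request, drop lines
# whose user agent matches a crawler pattern, and count the unique IPs left.
import re
from itertools import repeat

crawler_patterns = [re.compile("Googlebot"), re.compile("bingbot")]

# Hypothetical access-log lines in the usual nginx combined format.
sample_lines = [
    '203.0.113.5 - - [01/Jan/2024:00:00:01 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '203.0.113.5 - - [01/Jan/2024:00:00:02 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '198.51.100.7 - - [01/Jan/2024:00:00:03 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Googlebot/2.1"',
]

unique_ips = {
    line.split(' ')[0]
    for line in sample_lines
    if not any(map(re.search, crawler_patterns, repeat(line)))
}
print(len(unique_ips))  # 1: the repeat visit is deduplicated, the Googlebot hit is excluded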