First version of the script (#1)

Creation of the script

Co-authored-by: Hugo <saxodwarf@saxodwarf.fr>
Reviewed-on: #1
Co-authored-by: saxodwarf <saxodwarf@noreply.localhost>
Co-committed-by: saxodwarf <saxodwarf@noreply.localhost>
saxodwarf 2021-09-04 16:06:39 +02:00
parent b2058509c8
commit c3eb31258e
1 changed file with 62 additions and 0 deletions

get_page_stats.py Normal file

@@ -0,0 +1,62 @@
#!/usr/bin/env python3
""" Script to parse a sitemap.xml file,
then look through a NGINX log file for the number of hits for each of the URLs
defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(description='Collect the number of daily loads of each page from the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
                        help="Path to the sitemap XML file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """ Parses the arguments, the crawler file and the sitemap,
    then for each location, uses grep to select the lines containing GET calls for
    the location, and prints the number of unique IPs accessing it.
    """
    args = parse_args()
    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once and for all for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]
    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        # Keep only the path part of each <loc> URL, since the log lines contain paths
        locations.append(urlparse(url.find(f"{ns}loc").text).path)
    for path in locations:
        # Pre-process the log file using grep, to keep only the interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines()
                 if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")
if __name__ == "__main__":
main()
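
The densest step in main() is the final set comprehension: re.search is mapped over every compiled crawler pattern against the same log line, the line is dropped if any pattern matches, and otherwise only its first whitespace-separated field (the client IP) is kept, with the set handling deduplication. Below is a minimal, self-contained sketch of that filtering; the log lines, IP addresses and the Googlebot pattern are invented for illustration and are not part of this commit.

import re
from itertools import repeat

# One invented crawler pattern, standing in for the entries loaded from
# crawler-user-agents.json in the real script.
crawler_patterns = [re.compile("Googlebot")]

# Invented nginx log lines: two human hits from the same IP and one bot hit.
stdout = "\n".join([
    '203.0.113.7 - - [04/Sep/2021:10:00:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '203.0.113.7 - - [04/Sep/2021:10:05:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '198.51.100.2 - - [04/Sep/2021:10:06:00 +0200] "GET /about/ HTTP/1.1" 200 512 "-" "Googlebot/2.1"',
])

# Same expression as in the script: keep the first field (the source IP) of every
# line that matches no crawler pattern; the set removes duplicate IPs.
lines = {line.split(' ')[0] for line in stdout.splitlines()
         if not any(map(re.search, crawler_patterns, repeat(line)))}

print(lines)       # {'203.0.113.7'}
print(len(lines))  # 1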