Ditch grep and do all filtering in pure Python
commit 3e364d5f51 (parent 575c7e89db)
@@ -1,3 +1,5 @@
 # pages_stats
 
-Simple script to gather daily global statistics for hugo post served
+Simple script to gather daily global statistics for Hugo posts served.
+
+The crawler-user-agents.json file comes from [this project](https://github.com/monperrus/crawler-user-agents/).
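The crawler_patterns list that the script matches against is built from this JSON file, but the loading code is not part of the lines changed below. As a rough sketch of what that step could look like, assuming the file keeps the upstream format (a JSON array of objects with a "pattern" field):

import json
import re

def load_crawler_patterns(path="crawler-user-agents.json"):
    # Hypothetical helper, not part of this commit: compile one regex per
    # crawler entry so the patterns can be fed to re.search later on.
    with open(path) as crawler_file:
        entries = json.load(crawler_file)
    return [re.compile(entry["pattern"]) for entry in entries]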
@@ -6,6 +6,7 @@ defined in the sitemap, by unique IP.
 import re
 import json
 import argparse
+from collections import defaultdict
 from itertools import repeat
 from subprocess import run
 from urllib.parse import urlparse
@@ -14,7 +15,7 @@ import xml.etree.ElementTree as ET
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
-    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
+    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                         help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                         help="Path to the log file to analyze")
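The main() code further down reads args.exclude_crawler, which is declared outside the lines shown in this diff. A self-contained sketch of how such a switch is typically declared with argparse (the flag spelling and help text here are guesses, not the project's actual definition):

import argparse

parser = argparse.ArgumentParser()
# Assumed flag: argparse maps "--exclude-crawler" to the dest "exclude_crawler".
parser.add_argument("-x", "--exclude-crawler", action="store_true",
                    help="Report crawler/bot hits separately from regular visits.")
args = parser.parse_args(["--exclude-crawler"])
print(args.exclude_crawler)  # True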
@@ -24,8 +25,10 @@ def parse_args():
 
 def main():
     """ Parses the arguments, the crawler file and the sitemap,
-    then for each locations, uses grep to select the lines containing GET calls for
-    the location, and prints the number of unique IP accessing it.
+    then reads the log file line by line, using a regex to extract the location and client IP.
+    It records the number of unique IPs accessing each known page (from the sitemap), and
+    the number of unique IPs accessing each unknown location
+    (either resources being loaded or bots looking for vulnerable websites).
     """
     args = parse_args()
 
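To illustrate the line-by-line matching the new docstring describes, here is roughly how the log_line_template introduced in the next hunk pulls the client IP and location out of a typical nginx access-log line (the log line and page path below are invented for the example):

import re

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
known_page_regex = re.compile(log_line_template.format(locations=re.escape("/posts/hello-world/")))

sample = '198.51.100.42 - - [01/Jan/2024:00:00:00 +0000] "GET /posts/hello-world/ HTTP/1.1" 200 1234 "-" "Mozilla/5.0"'
match_obj = known_page_regex.match(sample)
if match_obj:
    # The named groups give the two values the script aggregates on.
    print(match_obj.group("ip_address"), match_obj.group("location"))
    # -> 198.51.100.42 /posts/hello-world/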
@@ -49,14 +52,65 @@ def main():
     for url in root:
         locations.append(urlparse(url.find(f"{ns}loc").text).path)
 
-    for path in locations:
-        # Pre-process log file using grep, to keep only interesting lines
-        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
-        process = run(cmd, capture_output=True, text=True)
-        # Silmutaneously keep only unique source IP and exclude crawlers if resquested
-        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
-
-        print(f"{path}: {len(lines)}")
+    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
+    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
+    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    visit_dict = dict(map(lambda x: (x, set()), locations))
+    bot_visit_dict = dict(map(lambda x: (x, set()), locations))
+    other_visits = defaultdict(set)
+    with open(args.logfile, 'r') as logfile:
+        for line in logfile:
+            match_obj = re.match(known_page_regex, line)
+            if match_obj:
+                client_ip = match_obj.group("ip_address")
+                location = match_obj.group("location")
+                if not any(map(re.search, crawler_patterns, repeat(line))):
+                    visit_dict[location].add(client_ip)
+                else:
+                    bot_visit_dict[location].add(client_ip)
+            else:
+                match_obj = re.match(other_pages_regex, line)
+                if match_obj:
+                    client_ip = match_obj.group("ip_address")
+                    location = match_obj.group("location")
+                    if location.startswith("/isso/"):
+                        other_visits["/isso/*"].add(client_ip)
+                    elif location.startswith("/assets/css/"):
+                        other_visits["/assets/css/*"].add(client_ip)
+                    elif location.startswith("/assets/js/"):
+                        other_visits["/assets/js/*"].add(client_ip)
+                    elif location.startswith("/images/"):
+                        other_visits["/images/*"].add(client_ip)
+                    else:
+                        other_visits[location.split('?')[0]].add(client_ip)
+
+
+    total_visits = 0
+    print("Standard visits:")
+    for loc, ips in visit_dict.items():
+        print(f"{loc}: {len(ips)}")
+        total_visits += len(ips)
+    print(f'Total visits: {total_visits}')
+    if args.exclude_crawler:
+        print("Bot visits:")
+        for loc, ips in bot_visit_dict.items():
+            print(f"{loc}: {len(ips)}")
+    nb_other_visits = 0
+    print("Other visits:")
+    for loc, ips in other_visits.items():
+        print(f"{loc}: {len(ips)}")
+        nb_other_visits += len(ips)
+    print(f'Total visits: {total_visits}')
+    print(f'Other visits: {nb_other_visits}')
+
+    #for path in locations:
+    #    # Pre-process log file using grep, to keep only interesting lines
+    #    cmd = ["grep", "-e", f'GET {path} ', args.logfile]
+    #    process = run(cmd, capture_output=True, text=True)
+    #    # Silmutaneously keep only unique source IP and exclude crawlers if resquested
+    #    lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
+    #
+    #    print(f"{path}: {len(lines)}")
 
 if __name__ == "__main__":
     main()
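For lines that do not match any sitemap location, the code above falls back to other_pages_regex and groups the hits under wildcard keys. A small self-contained sketch of that fallback path (again with an invented log line and asset path):

import re
from collections import defaultdict

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
other_pages_regex = re.compile(log_line_template.format(locations='.+?'))

other_visits = defaultdict(set)
line = '192.0.2.10 - - [01/Jan/2024:00:00:01 +0000] "GET /assets/css/style.css HTTP/1.1" 200 512 "-" "Mozilla/5.0"'
match_obj = other_pages_regex.match(line)
if match_obj:
    location = match_obj.group("location")
    if location.startswith("/assets/css/"):
        # All stylesheet requests are counted under one wildcard bucket.
        other_visits["/assets/css/*"].add(match_obj.group("ip_address"))

print(dict(other_visits))  # {'/assets/css/*': {'192.0.2.10'}}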