ditch grep and do all filtering in pure Python

Hugo 2021-09-11 17:33:06 +02:00
parent 575c7e89db
commit 3e364d5f51
2 changed files with 68 additions and 12 deletions

View File

@@ -1,3 +1,5 @@
# pages_stats
Simple script to gather daily global statistics for the Hugo posts served
Simple script to gather daily global statistics for the Hugo posts served.
The crawler-user-agents.json file comes from [this project](https://github.com/monperrus/crawler-user-agents/).
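The `crawler_patterns` list used by the script is built from this file. The loading code sits outside the hunks shown below, so the following is only a sketch: it assumes the upstream JSON layout (a list of objects with a `pattern` key), and the helper name and default path are illustrative.

import json

def load_crawler_patterns(path="crawler-user-agents.json"):
    # Each entry in the upstream file is an object whose "pattern" key holds a
    # regular expression matching a bot's user-agent string.
    with open(path, "r") as f:
        return [entry["pattern"] for entry in json.load(f)]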

View File

@@ -6,6 +6,7 @@ defined in the sitemap, by unique IP.
import re
import json
import argparse
from collections import defaultdict
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
@@ -14,7 +15,7 @@ import xml.etree.ElementTree as ET
def parse_args():
    parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
    parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze")
@@ -24,8 +25,10 @@ def parse_args():
def main():
""" Parses the arguments, the crawler file and the sitemap,
then for each locations, uses grep to select the lines containing GET calls for
the location, and prints the number of unique IP accessing it.
Then reads the log file line by line, regexes through it to isolate locations and client IP
It records the number of unique IP accessing each known pages (from the sitemap), and
the number of unique IP accessing each unknown locations.
(either ressources being loaded or bot looking for vulnerable website).
"""
args = parse_args()
@@ -49,14 +52,65 @@ def main():
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)
    for path in locations:
        # Pre-process log file using grep, to keep only interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
        print(f"{path}: {len(lines)}")
    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
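    # Illustrative example (made-up line, not from the real log): for an access-log
    # entry such as
    #   203.0.113.7 - - [11/Sep/2021:17:33:06 +0200] "GET /posts/hello/ HTTP/1.1" 200 ...
    # known_page_regex captures ip_address='203.0.113.7' and location='/posts/hello/'
    # when that path is listed in the sitemap; other_pages_regex is the fallback that
    # captures whatever path follows "GET " for requests outside the sitemap.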
    visit_dict = dict(map(lambda x: (x, set()), locations))
    bot_visit_dict = dict(map(lambda x: (x, set()), locations))
    other_visits = defaultdict(set)
    with open(args.logfile, 'r') as logfile:
        for line in logfile:
            match_obj = re.match(known_page_regex, line)
            if match_obj:
                client_ip = match_obj.group("ip_address")
                location = match_obj.group("location")
                if not any(map(re.search, crawler_patterns, repeat(line))):
                    visit_dict[location].add(client_ip)
                else:
                    bot_visit_dict[location].add(client_ip)
            else:
                match_obj = re.match(other_pages_regex, line)
                if match_obj:
                    client_ip = match_obj.group("ip_address")
                    location = match_obj.group("location")
                    if location.startswith("/isso/"):
                        other_visits["/isso/*"].add(client_ip)
                    elif location.startswith("/assets/css/"):
                        other_visits["/assets/css/*"].add(client_ip)
                    elif location.startswith("/assets/js/"):
                        other_visits["/assets/js/*"].add(client_ip)
                    elif location.startswith("/images/"):
                        other_visits["/images/*"].add(client_ip)
                    else:
                        other_visits[location.split('?')[0]].add(client_ip)
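    # The elif chain above collapses noisy per-file requests into wildcard buckets
    # (/isso/* presumably being the Isso comment server, plus static assets and
    # images) and strips query strings from everything else. An equivalent,
    # untested sketch (not in the original script) using a prefix table instead:
    #   GROUPED = ("/isso/", "/assets/css/", "/assets/js/", "/images/")
    #   prefix = next((p for p in GROUPED if location.startswith(p)), None)
    #   other_visits[prefix + "*" if prefix else location.split('?')[0]].add(client_ip)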
    total_visits = 0
    print("Standard visits:")
    for loc, ips in visit_dict.items():
        print(f"{loc}: {len(ips)}")
        total_visits += len(ips)
    print(f'Total visits: {total_visits}')
    if args.exclude_crawler:
        print("Bot visits:")
        for loc, ips in bot_visit_dict.items():
            print(f"{loc}: {len(ips)}")
    nb_other_visits = 0
    print("Other visits:")
    for loc, ips in other_visits.items():
        print(f"{loc}: {len(ips)}")
        nb_other_visits += len(ips)
    print(f'Total visits: {total_visits}')
    print(f'Other visits: {nb_other_visits}')
    #for path in locations:
    #    # Pre-process log file using grep, to keep only interesting lines
    #    cmd = ["grep", "-e", f'GET {path} ', args.logfile]
    #    process = run(cmd, capture_output=True, text=True)
    #    # Simultaneously keep only unique source IPs and exclude crawlers if requested
    #    lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
    #
    #    print(f"{path}: {len(lines)}")
if __name__ == "__main__":
    main()
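For reference, a minimal self-contained check of the new matching logic. The sample sitemap paths and log line are made up, since the real script builds `locations` from the sitemap and reads the nginx access log:

import re

log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
# Hypothetical sitemap paths, for illustration only.
locations = ["/", "/posts/hello/"]
known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))

# A made-up nginx access-log line in the combined format.
sample = '203.0.113.7 - - [11/Sep/2021:17:33:06 +0200] "GET /posts/hello/ HTTP/1.1" 200 1234 "-" "Mozilla/5.0"'
match = known_page_regex.match(sample)
print(match.group("ip_address"), match.group("location"))  # 203.0.113.7 /posts/hello/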