From 508b838bbc1d697365baf231a1cf089e1a744a20 Mon Sep 17 00:00:00 2001
From: Hugo
Date: Wed, 22 Sep 2021 22:04:09 +0200
Subject: [PATCH] reorganize the searching function

---
 get_page_stats.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/get_page_stats.py b/get_page_stats.py
index bb6de55..0db7ff4 100755
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -176,9 +176,9 @@ def parse_logfile(logfile_path: str, locations: List[str],
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                          '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-                                                                                  locations))))
-    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+    #                                                                              locations))))
+    log_regex = re.compile(log_line_template.format(locations='.+?'))
 
     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
@@ -195,23 +195,21 @@ def parse_logfile(logfile_path: str, locations: List[str],
         logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(known_page_regex, line)
+            match_obj = re.match(log_regex, line)
             if match_obj:
-                # For each line, check if it is a GET on a lnown page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
-                if not any(map(re.search, crawler_patterns, repeat(line))):
-                    visit_dict[location].add(client_ip)
-                else:
-                    bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
                 last_log_date = match_obj.group("time_local")
-            else:
-                # Also count lines that are NOT "GET on a known page" in a different dict.
-                match_obj = re.match(other_pages_regex, line)
-                if match_obj:
-                    client_ip = match_obj.group("ip_address")
-                    location = match_obj.group("location")
+                if location in locations:
+                    # For each line, if it is a GET on a known page, count it
+                    if not any(map(re.search, crawler_patterns,
+                                   repeat(match_obj.group("user_agent")))):
+                        visit_dict[location].add(client_ip)
+                    else:
+                        bot_visit_dict[location].add(client_ip)
+                        bot_user_agents.add(match_obj.group("user_agent"))
+                else:
+                    # Also count lines that are NOT "GET on a known page" in a different dict.
                     # Those other hits can be static site ressources loaded,
                     # in which case we group the hits
                     if location.startswith("/isso/"):
@@ -225,7 +223,6 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     else:
                         # for everything else, we store the exact path, but not the query string
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-                    last_log_date = match_obj.group("time_local")
 
     today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0, minute=0, second=0,
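
Note: below is a minimal, self-contained sketch of the single-regex approach this patch
switches to: one pattern compiled with locations='.+?' matches every GET line, and the
known/other split moves into plain Python via `location in locations`. The group names and
the map(re.search, ...) idiom mirror the patch; the `locations` list, `crawler_patterns`,
and the sample log line are invented here for illustration.

    import re
    from itertools import repeat

    # Illustrative stand-ins; the real script builds these from its configuration.
    locations = ["/index.html", "/blog/"]
    crawler_patterns = [re.compile(p) for p in (r"[Bb]ot", r"[Cc]rawler", r"[Ss]pider")]

    # Same template as in the patch: named groups for every field the loop reads.
    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                         '"(?P<user_agent>.+)"$')
    # A single regex matches any location; the dispatch happens below in Python.
    log_regex = re.compile(log_line_template.format(locations='.+?'))

    visit_dict = {loc: set() for loc in locations}
    bot_visit_dict = {loc: set() for loc in locations}

    sample_line = ('203.0.113.7 - - [22/Sep/2021:21:58:01 +0200] '
                   '"GET /blog/ HTTP/1.1" 200 5316 "-" "Mozilla/5.0 (X11; Linux x86_64)"')

    match_obj = log_regex.match(sample_line)
    if match_obj:
        client_ip = match_obj.group("ip_address")
        location = match_obj.group("location")
        if location in locations:
            # Known page: attribute the hit to a human visitor or a crawler.
            user_agent = match_obj.group("user_agent")
            if not any(map(re.search, crawler_patterns, repeat(user_agent))):
                visit_dict[location].add(client_ip)
            else:
                bot_visit_dict[location].add(client_ip)
        # else: the "other pages" accounting from the patch would go here

    print(visit_dict)  # {'/index.html': set(), '/blog/': {'203.0.113.7'}}

One upside of this reorganization is that each line is matched against a single compiled
regex instead of up to two, and the known-page test becomes a cheap membership check.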