From 750a72a477a801873974f87058d31ea09c4b548e Mon Sep 17 00:00:00 2001
From: Hugo
Date: Thu, 23 Sep 2021 18:03:05 +0200
Subject: [PATCH] check already existing UA before checking regex

---
 get_page_stats.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/get_page_stats.py b/get_page_stats.py
index 0db7ff4..a9b6a6e 100755
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -185,6 +185,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     other_visit_dict: VisitDict = defaultdict(set)
     bot_user_agents = set()
+    client_user_agents = set()
     # The way to get the timezone data here is not great (not taking into account DST and such)
     # but it is a fallback default date that should hardly ever be used.
     last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -200,14 +201,20 @@ def parse_logfile(logfile_path: str, locations: List[str],
             client_ip = match_obj.group("ip_address")
             location = match_obj.group("location")
             last_log_date = match_obj.group("time_local")
+            user_agent = match_obj.group("user_agent")
             if location in locations:
                 # For each line, if it is a GET on a known page, count it
-                if not any(map(re.search, crawler_patterns,
-                               repeat(match_obj.group("user_agent")))):
+                # Cache each UA's classification so the regex scan only runs
+                # the first time a given User-Agent string is seen.
+                if (user_agent in client_user_agents or
+                        (user_agent not in bot_user_agents and
+                         not any(map(re.search, crawler_patterns,
+                                     repeat(user_agent))))):
                     visit_dict[location].add(client_ip)
+                    client_user_agents.add(user_agent)
                 else:
                     bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
+                    bot_user_agents.add(user_agent)
             else:
                 # Also count lines that are NOT "GET on a known page" in a different dict.
                 # Those other hits can be static site ressources loaded,