Check already-seen user agents (UA) before running the crawler regexes

2021-09-23 18:03:05 +02:00 · 2021-09-23 18:03:05 +02:00 · 750a72a477
parent 508b838bbc
commit 750a72a477
1 changed files with 8 additions and 3 deletions
--- a/get_page_stats.py
+++ b/get_page_stats.py
@ -185,6 +185,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
    bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
    other_visit_dict: VisitDict = defaultdict(set)
    bot_user_agents = set()
+    client_user_agents = set()
    # The way to get the timezone data here is not great (not taking into account DST and such)
    # but it is a fallback default date that should hardly ever be used.
    last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@ -200,14 +201,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
                client_ip = match_obj.group("ip_address")
                location = match_obj.group("location")
                last_log_date = match_obj.group("time_local")
+                user_agent = match_obj.group("user_agent")
                if location in locations:
                    # For each line, if it is a GET on a known page, count it
-                    if not any(map(re.search, crawler_patterns,
-                                   repeat(match_obj.group("user_agent")))):
+                    if (not user_agent in bot_user_agents and
+                        user_agent in client_user_agents or
+                        not any(map(re.search, crawler_patterns,
+                                    repeat(user_agent)))):
                        visit_dict[location].add(client_ip)
+                        client_user_agents.add(user_agent)
                    else:
                        bot_visit_dict[location].add(client_ip)
-                        bot_user_agents.add(match_obj.group("user_agent"))
+                        bot_user_agents.add(user_agent)
                else:
                    # Also count lines that are NOT "GET on a known page" in a different dict.
                    # Those other hits can be static site ressources loaded,