From 750a72a477a801873974f87058d31ea09c4b548e Mon Sep 17 00:00:00 2001
From: Hugo
Date: Thu, 23 Sep 2021 18:03:05 +0200
Subject: [PATCH] check already existing UA before checking regex

---
 get_page_stats.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/get_page_stats.py b/get_page_stats.py
index 0db7ff4..a9b6a6e 100755
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -185,6 +185,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     other_visit_dict: VisitDict = defaultdict(set)
     bot_user_agents = set()
+    client_user_agents = set()
     # The way to get the timezone data here is not great (not taking into account DST and such)
     # but it is a fallback default date that should hardly ever be used.
     last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -200,14 +201,20 @@ def parse_logfile(logfile_path: str, locations: List[str],
             client_ip = match_obj.group("ip_address")
             location = match_obj.group("location")
             last_log_date = match_obj.group("time_local")
+            user_agent = match_obj.group("user_agent")
             if location in locations:
                 # For each line, if it is a GET on a known page, count it
-                if not any(map(re.search, crawler_patterns,
-                               repeat(match_obj.group("user_agent")))):
+                # Cache each UA's classification so the regex scan only runs
+                # the first time a given User-Agent string is seen.
+                if (user_agent in client_user_agents or
+                        (user_agent not in bot_user_agents and
+                         not any(map(re.search, crawler_patterns,
+                                     repeat(user_agent))))):
                     visit_dict[location].add(client_ip)
+                    client_user_agents.add(user_agent)
                 else:
                     bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
+                    bot_user_agents.add(user_agent)
             else:
                 # Also count lines that are NOT "GET on a known page" in a different dict.
                 # Those other hits can be static site ressources loaded,