check already existing UA before checking regex

This commit is contained in:
Hugo 2021-09-23 18:03:05 +02:00
parent 508b838bbc
commit 750a72a477
1 changed file with 8 additions and 3 deletions

View File

@@ -185,6 +185,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
other_visit_dict: VisitDict = defaultdict(set)
bot_user_agents = set()
client_user_agents = set()
# The way to get the timezone data here is not great (not taking into account DST and such)
# but it is a fallback default date that should hardly ever be used.
last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -200,14 +201,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
last_log_date = match_obj.group("time_local")
user_agent = match_obj.group("user_agent")
if location in locations:
# For each line, if it is a GET on a known page, count it
if not any(map(re.search, crawler_patterns,
repeat(match_obj.group("user_agent")))):
if (not user_agent in bot_user_agents and
user_agent in client_user_agents or
not any(map(re.search, crawler_patterns,
repeat(user_agent)))):
visit_dict[location].add(client_ip)
client_user_agents.add(user_agent)
else:
bot_visit_dict[location].add(client_ip)
bot_user_agents.add(match_obj.group("user_agent"))
bot_user_agents.add(user_agent)
else:
# Also count lines that are NOT "GET on a known page" in a different dict.
# Those other hits can be static site resources loaded,