check already existing UA before checking regex
parent 508b838bbc
commit 750a72a477
@@ -185,6 +185,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     other_visit_dict: VisitDict = defaultdict(set)
     bot_user_agents = set()
+    client_user_agents = set()
     # The way to get the timezone data here is not great (not taking into account DST and such)
     # but it is a fallback default date that should hardly ever be used.
     last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -200,14 +201,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
         client_ip = match_obj.group("ip_address")
         location = match_obj.group("location")
         last_log_date = match_obj.group("time_local")
+        user_agent = match_obj.group("user_agent")
         if location in locations:
             # For each line, if it is a GET on a known page, count it
-            if not any(map(re.search, crawler_patterns,
-                           repeat(match_obj.group("user_agent")))):
+            if (not user_agent in bot_user_agents and
+                    user_agent in client_user_agents or
+                    not any(map(re.search, crawler_patterns,
+                                repeat(user_agent)))):
                 visit_dict[location].add(client_ip)
+                client_user_agents.add(user_agent)
             else:
                 bot_visit_dict[location].add(client_ip)
-                bot_user_agents.add(match_obj.group("user_agent"))
+                bot_user_agents.add(user_agent)
         else:
             # Also count lines that are NOT "GET on a known page" in a different dict.
             # Those other hits can be static site ressources loaded,
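
The change this diff makes is an optimization: user agents that were already classified on an earlier log line are looked up in the client_user_agents / bot_user_agents sets before the crawler regex scan runs again. Below is a minimal standalone sketch of that idea, assuming a made-up crawler_patterns list and a hypothetical is_bot() helper; it illustrates the caching pattern rather than reproducing the project's actual control flow.

import re
from itertools import repeat

# Assumed example patterns; the real list is defined elsewhere in the project.
crawler_patterns = [r"[Bb]ot", r"[Cc]rawler", r"[Ss]pider"]

client_user_agents = set()  # user agents already classified as regular clients
bot_user_agents = set()     # user agents already classified as crawlers


def is_bot(user_agent):
    """Classify a user agent, consulting the caches before running any regex."""
    if user_agent in client_user_agents:
        return False
    if user_agent in bot_user_agents:
        return True
    # Only unseen user agents reach the regex scan; map/repeat tries each
    # pattern against the same user agent string, the same idiom as in the diff.
    if any(map(re.search, crawler_patterns, repeat(user_agent))):
        bot_user_agents.add(user_agent)
        return True
    client_user_agents.add(user_agent)
    return False


print(is_bot("Mozilla/5.0 (compatible; Googlebot/2.1)"))        # True, via the regex scan
print(is_bot("Mozilla/5.0 (X11; Linux x86_64) Firefox/115.0"))  # False, cached as a client UA
print(is_bot("Mozilla/5.0 (compatible; Googlebot/2.1)"))        # True, from the cache this time

Since a site typically sees far fewer distinct user agents than log lines, the set lookups avoid re-running the regex scan for most lines once the common user agents have been seen.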