From 508b838bbc1d697365baf231a1cf089e1a744a20 Mon Sep 17 00:00:00 2001
From: Hugo
Date: Wed, 22 Sep 2021 22:04:09 +0200
Subject: [PATCH] reorganize the searching function

---
 get_page_stats.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/get_page_stats.py b/get_page_stats.py
index bb6de55..0db7ff4 100755
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -176,9 +176,9 @@ def parse_logfile(logfile_path: str, locations: List[str],
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                          '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-                                                                                  locations))))
-    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+    #                                                                              locations))))
+    log_regex = re.compile(log_line_template.format(locations='.+?'))
 
     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
@@ -195,23 +195,21 @@ def parse_logfile(logfile_path: str, locations: List[str],
         logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(known_page_regex, line)
+            match_obj = re.match(log_regex, line)
             if match_obj:
-                # For each line, check if it is a GET on a lnown page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
-                if not any(map(re.search, crawler_patterns, repeat(line))):
-                    visit_dict[location].add(client_ip)
-                else:
-                    bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
                 last_log_date = match_obj.group("time_local")
-            else:
-                # Also count lines that are NOT "GET on a known page" in a different dict.
-                match_obj = re.match(other_pages_regex, line)
-                if match_obj:
-                    client_ip = match_obj.group("ip_address")
-                    location = match_obj.group("location")
+                if location in locations:
+                    # For each line, if it is a GET on a known page, count it
+                    if not any(map(re.search, crawler_patterns,
+                                   repeat(match_obj.group("user_agent")))):
+                        visit_dict[location].add(client_ip)
+                    else:
+                        bot_visit_dict[location].add(client_ip)
+                        bot_user_agents.add(match_obj.group("user_agent"))
+                else:
+                    # Also count lines that are NOT "GET on a known page" in a different dict.
                     # Those other hits can be static site ressources loaded,
                     # in which case we group the hits
                     if location.startswith("/isso/"):
@@ -225,7 +223,6 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     else:
                         # for everything else, we store the exact path, but not the query string
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-                    last_log_date = match_obj.group("time_local")
 
     today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0, minute=0, second=0,
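
Note: below is a minimal, self-contained sketch of the single-regex approach this patch
switches to: one pattern compiled with locations='.+?' matches every GET line, and the
known/other split moves into plain Python via `location in locations`. The group names and
the map(re.search, ...) idiom mirror the patch; the `locations` list, `crawler_patterns`,
and the sample log line are invented here for illustration.

    import re
    from itertools import repeat

    # Illustrative stand-ins; the real script builds these from its configuration.
    locations = ["/index.html", "/blog/"]
    crawler_patterns = [re.compile(p) for p in (r"[Bb]ot", r"[Cc]rawler", r"[Ss]pider")]

    # Same template as in the patch: named groups for every field the loop reads.
    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                         '"(?P<user_agent>.+)"$')
    # A single regex matches any location; the dispatch happens below in Python.
    log_regex = re.compile(log_line_template.format(locations='.+?'))

    visit_dict = {loc: set() for loc in locations}
    bot_visit_dict = {loc: set() for loc in locations}

    sample_line = ('203.0.113.7 - - [22/Sep/2021:21:58:01 +0200] '
                   '"GET /blog/ HTTP/1.1" 200 5316 "-" "Mozilla/5.0 (X11; Linux x86_64)"')

    match_obj = log_regex.match(sample_line)
    if match_obj:
        client_ip = match_obj.group("ip_address")
        location = match_obj.group("location")
        if location in locations:
            # Known page: attribute the hit to a human visitor or a crawler.
            user_agent = match_obj.group("user_agent")
            if not any(map(re.search, crawler_patterns, repeat(user_agent))):
                visit_dict[location].add(client_ip)
            else:
                bot_visit_dict[location].add(client_ip)
        # else: the "other pages" accounting from the patch would go here

    print(visit_dict)  # {'/index.html': set(), '/blog/': {'203.0.113.7'}}

One upside of this reorganization is that each line is matched against a single compiled
regex instead of up to two, and the known-page test becomes a cheap membership check.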