reorganize the searching function

Hugo 2021-09-22 22:04:09 +02:00
parent 6ce2cea3a1
commit 508b838bbc
1 changed file with 14 additions and 17 deletions

@@ -176,9 +176,9 @@ def parse_logfile(logfile_path: str, locations: List[str],
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                          '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-                                                                                  locations))))
-    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+    #                                                                              locations))))
+    log_regex = re.compile(log_line_template.format(locations='.+?'))
     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
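
Note: this hunk collapses the two compiled patterns into a single log_regex that
captures any location; known pages are told apart later by a membership test (see
the next hunk). A minimal sketch of how the template expands and matches, using a
made-up access-log line rather than one from the repo:

    import re

    # Same template as in parse_logfile; the doubled braces keep the {3}
    # quantifier intact through str.format.
    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                         '"(?P<user_agent>.+)"$')

    # The single regex introduced by this commit: any location is captured.
    log_regex = re.compile(log_line_template.format(locations='.+?'))

    # Illustrative access-log line (made up, not from the repo).
    sample = ('203.0.113.7 - - [22/Sep/2021:21:55:00 +0200] '
              '"GET /about.html HTTP/1.1" 200 1234 '
              '"https://example.org/" "Mozilla/5.0"')

    match_obj = log_regex.match(sample)
    assert match_obj is not None
    print(match_obj.group("ip_address"))  # 203.0.113.7
    print(match_obj.group("location"))    # /about.html
    print(match_obj.group("user_agent"))  # Mozilla/5.0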
@@ -195,23 +195,21 @@ def parse_logfile(logfile_path: str, locations: List[str],
     logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(known_page_regex, line)
+            match_obj = re.match(log_regex, line)
             if match_obj:
-                # For each line, check if it is a GET on a lnown page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
-                if not any(map(re.search, crawler_patterns, repeat(line))):
-                    visit_dict[location].add(client_ip)
-                else:
-                    bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
                 last_log_date = match_obj.group("time_local")
-            else:
-                # Also count lines that are NOT "GET on a known page" in a different dict.
-                match_obj = re.match(other_pages_regex, line)
-                if match_obj:
-                    client_ip = match_obj.group("ip_address")
-                    location = match_obj.group("location")
+                if location in locations:
+                    # For each line, if it is a GET on a known page, count it
+                    if not any(map(re.search, crawler_patterns,
+                                   repeat(match_obj.group("user_agent")))):
+                        visit_dict[location].add(client_ip)
+                    else:
+                        bot_visit_dict[location].add(client_ip)
+                        bot_user_agents.add(match_obj.group("user_agent"))
+                else:
+                    # Also count lines that are NOT "GET on a known page" in a different dict.
                     # Those other hits can be static site ressources loaded,
                     # in which case we group the hits
                     if location.startswith("/isso/"):
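
Note: each line is now matched once and dispatched on "location in locations", and
the crawler check searches only the user_agent group instead of the whole line,
presumably to avoid bot-like substrings elsewhere in the line. The
any(map(re.search, crawler_patterns, repeat(...))) idiom pairs every pattern with
the same string; a small equivalent sketch, with hypothetical stand-ins for
crawler_patterns:

    import re
    from itertools import repeat

    # Hypothetical stand-ins; the real crawler_patterns live elsewhere in the file.
    crawler_patterns = [r'[Bb]ot\b', r'crawler', r'spider']

    user_agent = 'Mozilla/5.0 (compatible; ExampleBot/1.0)'

    # map(re.search, crawler_patterns, repeat(user_agent)) lazily calls
    # re.search(pattern, user_agent) once per pattern; any() stops at the
    # first match, so the remaining patterns are never tried.
    is_bot = any(map(re.search, crawler_patterns, repeat(user_agent)))

    # Equivalent, more conventional spelling:
    assert is_bot == any(re.search(p, user_agent) for p in crawler_patterns)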
@@ -225,7 +223,6 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     else:
                         # for everything else, we store the exact path, but not the query string
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-                    last_log_date = match_obj.group("time_local")
     today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
                                                                           minute=0,
                                                                           second=0,
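
Note: last_log_date is now assigned once per matched line (see the previous hunk),
so the duplicate assignment in the other-pages branch is dropped here. A sketch of
the date truncation in the trailing context, assuming time_local_fmt (defined
elsewhere in the file) follows the usual nginx $time_local layout:

    from datetime import datetime

    # Assumed layout of time_local_fmt; the real definition is elsewhere.
    time_local_fmt = '%d/%b/%Y:%H:%M:%S %z'

    last_log_date = '22/Sep/2021:21:55:00 +0200'

    # Truncate the last parsed timestamp to midnight, as the trailing
    # context does, so the value identifies a day rather than an instant.
    today_date = datetime.strptime(last_log_date, time_local_fmt).replace(
        hour=0, minute=0, second=0, microsecond=0)
    print(today_date)  # 2021-09-22 00:00:00+02:00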