From 6ce2cea3a18367afc942a3f4fc49e32a0386777b Mon Sep 17 00:00:00 2001 From: Hugo Date: Wed, 22 Sep 2021 21:48:06 +0200 Subject: [PATCH] Add some timestamp-related improvement --- get_page_stats.py | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/get_page_stats.py b/get_page_stats.py index 7648258..bb6de55 100755 --- a/get_page_stats.py +++ b/get_page_stats.py @@ -12,7 +12,7 @@ import getpass import argparse import subprocess import configparser -from datetime import datetime, time +from datetime import datetime from collections import defaultdict from itertools import repeat from urllib.parse import urlparse @@ -69,6 +69,12 @@ class TelegrafExporter(): def telegraf_post(self, timestamp:int, create_time: int, title:str, location:str, count:int)-> requests.Response: """ Post a value to telegraf + :param timestamp: timestamp used by influxdb as time field. + :param create_time: second of the day at which the data point is exported + (to de-duplicate entries generated on the same day). + :param title: name of the destination table in influxdb + :param location: path for which we register the hit count, used as a tag in influxdb. 
+ :param count: hit count for the aforementioned path """ payload = {"name": title, "timestamp": timestamp, @@ -81,13 +87,14 @@ class TelegrafExporter(): auth=(self.username, self._password)) - def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None: + def export_result_to_telegraf(self, page_hits: VisitDict, + bot_hits: VisitDict, timestamp: int) -> None: """ Export the bot_hits and page_hits dictionnaries to telegraf """ # export standard hits - timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp()) now = datetime.now().time() create_time = now.second + 60*now.minute + 3600*now.hour + name="blog_client_hit" for location, ips in page_hits.items(): try: @@ -156,12 +163,19 @@ def get_locations(sitemap_path:str) -> List[str]: def parse_logfile(logfile_path: str, locations: List[str], - crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]: - """ Parse a logfile, and return 3 dicts: - page_hits, bot_hits and other_hits + crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, + VisitDict, Dict[str, int]]: + """ Parse a logfile, and return 4 dicts: + page_hits, bot_hits, other_hits and additional_infos """ + time_local_fmt = "%d/%b/%Y:%H:%M:%S %z" # Regexes for all the pattern matching - log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*' + # Default format for NGINX log is: + # pylint: disable=line-too-long + # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" + log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] ' + '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" 
+ '"(?P<user_agent>.+)"$') known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations)))) other_pages_regex = re.compile(log_line_template.format(locations='.+?')) @@ -170,6 +184,12 @@ def parse_logfile(logfile_path: str, locations: List[str], visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations)) bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations)) other_visit_dict: VisitDict = defaultdict(set) + bot_user_agents = set() + # The way to get the timezone data here is not great (not taking into account DST and such) + # but it is a fallback default date that should hardly ever be used. + last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt) + + # Do not parse a log file that has not been edited for more than 24 hours if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600: print("Log file is too old, there was no access today.") logfile_path="/dev/null" @@ -184,13 +204,16 @@ def parse_logfile(logfile_path: str, locations: List[str], visit_dict[location].add(client_ip) else: bot_visit_dict[location].add(client_ip) + bot_user_agents.add(match_obj.group("user_agent")) + last_log_date = match_obj.group("time_local") else: - # Also count lines that are NOT GET on a known page in a different dict. + # Also count lines that are NOT "GET on a known page" in a different dict. 
match_obj = re.match(other_pages_regex, line) if match_obj: client_ip = match_obj.group("ip_address") location = match_obj.group("location") - # Those other hits are either ressource loaded, in this case we group the hits + # Those other hits can be static site resources loaded, + # in which case we group the hits if location.startswith("/isso/"): other_visit_dict["/isso/*"].add(client_ip) elif location.startswith("/assets/css/"): @@ -200,9 +223,18 @@ def parse_logfile(logfile_path: str, locations: List[str], elif location.startswith("/images/"): other_visit_dict["/images/*"].add(client_ip) else: - # for everything else, we store the exact path + # for everything else, we store the exact path, but not the query string other_visit_dict[location.split('?')[0]].add(client_ip) - return visit_dict, bot_visit_dict, other_visit_dict + last_log_date = match_obj.group("time_local") today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0, minute=0, second=0, microsecond=0) additional_infos = {"last_log_timestamp": int(today_date.timestamp()), "bot_user_agents_nb": len(bot_user_agents)} + + + return visit_dict, bot_visit_dict, other_visit_dict, additional_infos def main() -> None: @@ -234,14 +266,15 @@ def main() -> None: # Get parser, get locations and parse the log file crawler_patterns = get_crawler_patterns(args.exclude_crawler) locations = get_locations(args.sitemap) - visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile, - locations, - crawler_patterns) + visit_dict, bot_visit_dict, other_visit_dict, additional_infos = parse_logfile(args.logfile, + locations, + crawler_patterns) # Generate the report print_visit_dict("Standard visits", visit_dict) if args.exclude_crawler: print_visit_dict("Bot visits", bot_visit_dict) + print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)") print_visit_dict("Other visits", other_visit_dict) if telegraf_url: @@ -249,7 +282,9 @@ def main() -> None: 
username=username, password=_password, source=socket.gethostname()) - exporter.export_result_to_telegraf(visit_dict, bot_visit_dict) + exporter.export_result_to_telegraf(visit_dict, + bot_visit_dict, + additional_infos["last_log_timestamp"]) if __name__ == "__main__": main()