From 6ce2cea3a18367afc942a3f4fc49e32a0386777b Mon Sep 17 00:00:00 2001 From: Hugo Date: Wed, 22 Sep 2021 21:48:06 +0200 Subject: [PATCH] Add some timestamp-related improvement --- get_page_stats.py | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/get_page_stats.py b/get_page_stats.py index 7648258..bb6de55 100755 --- a/get_page_stats.py +++ b/get_page_stats.py @@ -12,7 +12,7 @@ import getpass import argparse import subprocess import configparser -from datetime import datetime, time +from datetime import datetime from collections import defaultdict from itertools import repeat from urllib.parse import urlparse @@ -69,6 +69,12 @@ class TelegrafExporter(): def telegraf_post(self, timestamp:int, create_time: int, title:str, location:str, count:int)-> requests.Response: """ Post a value to telegraf + :param timestamp: timestamp used by influxdb as time field. + :param create_time: second of the day at which the data point is exported + (to de-duplicate entries generated on the same day). + :param title: name of the destination table in influxdb + :param location: path for which we register the hit count, used as a tag in influxdb. 
+ :param count: hit count for the aforementioned path """ payload = {"name": title, "timestamp": timestamp, @@ -81,13 +87,14 @@ class TelegrafExporter(): auth=(self.username, self._password)) - def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None: + def export_result_to_telegraf(self, page_hits: VisitDict, + bot_hits: VisitDict, timestamp: int) -> None: """ Export the bot_hits and page_hits dictionnaries to telegraf """ # export standard hits - timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp()) now = datetime.now().time() create_time = now.second + 60*now.minute + 3600*now.hour + name="blog_client_hit" for location, ips in page_hits.items(): try: @@ -156,12 +163,19 @@ def get_locations(sitemap_path:str) -> List[str]: def parse_logfile(logfile_path: str, locations: List[str], - crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]: - """ Parse a logfile, and return 3 dicts: - page_hits, bot_hits and other_hits + crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, + VisitDict, Dict[str, int]]: + """ Parse a logfile, and return 4 dicts: + page_hits, bot_hits, other_hits and additional_infos """ + time_local_fmt = "%d/%b/%Y:%H:%M:%S %z" # Regexes for all the pattern matching - log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*' + # Default format for NGINX log is: + # pylint: disable=line-too-long + # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" + log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] ' + '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" 
+ '"(?P<user_agent>.+)"$') known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations)))) other_pages_regex = re.compile(log_line_template.format(locations='.+?')) @@ -170,6 +184,12 @@ def parse_logfile(logfile_path: str, locations: List[str], visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations)) bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations)) other_visit_dict: VisitDict = defaultdict(set) + bot_user_agents = set() + # The way to get the timezone data here is not great (not taking into account DST and such) + # but it is a fallback default date that should hardly ever be used. + last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt) + + # Do not parse a log file that has not been edited for more than 24 hours if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600: print("Log file is too old, there was no access today.") logfile_path="/dev/null" @@ -184,13 +204,16 @@ def parse_logfile(logfile_path: str, locations: List[str], visit_dict[location].add(client_ip) else: bot_visit_dict[location].add(client_ip) + bot_user_agents.add(match_obj.group("user_agent")) + last_log_date = match_obj.group("time_local") else: - # Also count lines that are NOT GET on a known page in a different dict. + # Also count lines that are NOT "GET on a known page" in a different dict. 
match_obj = re.match(other_pages_regex, line) if match_obj: client_ip = match_obj.group("ip_address") location = match_obj.group("location") - # Those other hits are either ressource loaded, in this case we group the hits + # Those other hits can be static site resources loaded, + # in which case we group the hits if location.startswith("/isso/"): other_visit_dict["/isso/*"].add(client_ip) elif location.startswith("/assets/css/"): @@ -200,9 +223,18 @@ def parse_logfile(logfile_path: str, locations: List[str], elif location.startswith("/images/"): other_visit_dict["/images/*"].add(client_ip) else: - # for everything else, we store the exact path + # for everything else, we store the exact path, but not the query string other_visit_dict[location.split('?')[0]].add(client_ip) - return visit_dict, bot_visit_dict, other_visit_dict + last_log_date = match_obj.group("time_local") today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0, minute=0, second=0, microsecond=0) additional_infos = {"last_log_timestamp": int(today_date.timestamp()), "bot_user_agents_nb": len(bot_user_agents)} + + + return visit_dict, bot_visit_dict, other_visit_dict, additional_infos def main() -> None: @@ -234,14 +266,15 @@ def main() -> None: # Get parser, get locations and parse the log file crawler_patterns = get_crawler_patterns(args.exclude_crawler) locations = get_locations(args.sitemap) - visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile, - locations, - crawler_patterns) + visit_dict, bot_visit_dict, other_visit_dict, additional_infos = parse_logfile(args.logfile, + locations, + crawler_patterns) # Generate the report print_visit_dict("Standard visits", visit_dict) if args.exclude_crawler: print_visit_dict("Bot visits", bot_visit_dict) + print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)") print_visit_dict("Other visits", other_visit_dict) if telegraf_url: @@ -249,7 +282,9 @@ def main() -> None: 
username=username, password=_password, source=socket.gethostname()) - exporter.export_result_to_telegraf(visit_dict, bot_visit_dict) + exporter.export_result_to_telegraf(visit_dict, + bot_visit_dict, + additional_infos["last_log_timestamp"]) if __name__ == "__main__": main()