Add some timestamp-related improvement

This commit is contained in:
Hugo 2021-09-22 21:48:06 +02:00
parent fe019750a5
commit 6ce2cea3a1
1 changed files with 50 additions and 15 deletions

View File

@ -12,7 +12,7 @@ import getpass
import argparse
import subprocess
import configparser
from datetime import datetime, time
from datetime import datetime
from collections import defaultdict
from itertools import repeat
from urllib.parse import urlparse
@ -69,6 +69,12 @@ class TelegrafExporter():
def telegraf_post(self, timestamp:int, create_time: int, title:str,
location:str, count:int)-> requests.Response:
""" Post a value to telegraf
:param timestamp: timestamp used by influxdb as time field.
:param create_time: second of the day at which the data point is exported
(to de-duplicate entries generated on the same day).
:param title: name of the destination table in influxdb
:param location: path for which we register the hit count, used as a tag in influxdb.
:param count: hit count for the aforementioned path
"""
payload = {"name": title,
"timestamp": timestamp,
@ -81,13 +87,14 @@ class TelegrafExporter():
auth=(self.username, self._password))
def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
def export_result_to_telegraf(self, page_hits: VisitDict,
bot_hits: VisitDict, timestamp: int) -> None:
""" Export the bot_hits and page_hits dictionnaries to telegraf
"""
# export standard hits
timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
now = datetime.now().time()
create_time = now.second + 60*now.minute + 3600*now.hour
name="blog_client_hit"
for location, ips in page_hits.items():
try:
@ -156,12 +163,19 @@ def get_locations(sitemap_path:str) -> List[str]:
def parse_logfile(logfile_path: str, locations: List[str],
crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]:
""" Parse a logfile, and return 3 dicts:
page_hits, bot_hits and other_hits
crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict,
VisitDict, Dict[str, int]]:
""" Parse a logfile, and return 4 dicts:
page_hits, bot_hits, other_hits and additional_infos
"""
time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
# Regexes for all the pattern matching
log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
# Default format for NGINX log is:
# pylint: disable=line-too-long
# $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
'"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
'"(?P<user_agent>.+)"$')
known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
locations))))
other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
@ -170,6 +184,12 @@ def parse_logfile(logfile_path: str, locations: List[str],
visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
other_visit_dict: VisitDict = defaultdict(set)
bot_user_agents = set()
# The way to get the timezone data here is not great (not taking into account DST and such)
# but it is a fallback default date that should hardly ever be used.
last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
# Do not parse a log file that has not been modified in the last 24 hours
if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600:
print("Log file is too old, there was no access today.")
logfile_path="/dev/null"
@ -184,13 +204,16 @@ def parse_logfile(logfile_path: str, locations: List[str],
visit_dict[location].add(client_ip)
else:
bot_visit_dict[location].add(client_ip)
bot_user_agents.add(match_obj.group("user_agent"))
last_log_date = match_obj.group("time_local")
else:
# Also count lines that are NOT GET on a known page in a different dict.
# Also count lines that are NOT "GET on a known page" in a different dict.
match_obj = re.match(other_pages_regex, line)
if match_obj:
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
# Those other hits are either ressource loaded, in this case we group the hits
# Those other hits can be static site resources being loaded,
# in which case we group the hits
if location.startswith("/isso/"):
other_visit_dict["/isso/*"].add(client_ip)
elif location.startswith("/assets/css/"):
@ -200,9 +223,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
elif location.startswith("/images/"):
other_visit_dict["/images/*"].add(client_ip)
else:
# for everything else, we store the exact path
# for everything else, we store the exact path, but not the query string
other_visit_dict[location.split('?')[0]].add(client_ip)
return visit_dict, bot_visit_dict, other_visit_dict
last_log_date = match_obj.group("time_local")
today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
minute=0,
second=0,
microsecond=0)
additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
"bot_user_agents_nb": len(bot_user_agents)}
return visit_dict, bot_visit_dict, other_visit_dict, additional_infos
def main() -> None:
@ -234,14 +266,15 @@ def main() -> None:
# Get parser, get locations and parse the log file
crawler_patterns = get_crawler_patterns(args.exclude_crawler)
locations = get_locations(args.sitemap)
visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
locations,
crawler_patterns)
visit_dict, bot_visit_dict, other_visit_dict, additional_infos = parse_logfile(args.logfile,
locations,
crawler_patterns)
# Generate the report
print_visit_dict("Standard visits", visit_dict)
if args.exclude_crawler:
print_visit_dict("Bot visits", bot_visit_dict)
print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
print_visit_dict("Other visits", other_visit_dict)
if telegraf_url:
@ -249,7 +282,9 @@ def main() -> None:
username=username,
password=_password,
source=socket.gethostname())
exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)
exporter.export_result_to_telegraf(visit_dict,
bot_visit_dict,
additional_infos["last_log_timestamp"])
if __name__ == "__main__":
main()