From 723a82ecce29449d30610d332a0536b361c7a0f4 Mon Sep 17 00:00:00 2001 From: Hugo Date: Sat, 9 Oct 2021 12:29:51 +0200 Subject: [PATCH] Report more stats to Telegraf, and improve dummy info generator --- gen_log.sh | 6 ++--- get_page_stats.py | 57 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/gen_log.sh b/gen_log.sh index 02ec3ac..6e3c051 100755 --- a/gen_log.sh +++ b/gen_log.sh @@ -1,8 +1,8 @@ #!/bin/bash - -for i in {1..34000}; do - n=$( echo "$i % 256" | bc) +max=$(( 200000 - "$(wc -l 'much_log.log' | cut -d ' ' -f 1)")) +for (( i=0; i < $max; i++)); do + n=$(( $i % 256)) echo "10.10.100.$n - - [10/Sep/2021:23:17:55 +0200] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\"" if (( $i % 1000 == 0 )); then echo $i 1>&2 diff --git a/get_page_stats.py b/get_page_stats.py index d1d2e45..fdd0734 100755 --- a/get_page_stats.py +++ b/get_page_stats.py @@ -13,7 +13,7 @@ import argparse import subprocess import configparser from datetime import datetime -from collections import defaultdict +from collections import defaultdict, Counter from itertools import repeat from urllib.parse import urlparse from typing import Dict, List, Tuple, Set @@ -22,7 +22,7 @@ import xml.etree.ElementTree as ET import requests VisitDict = Dict[str, Set[str]] - +MAX_UA_NB = 1000 def parse_args()-> argparse.Namespace: """ Parse arguments of the script @@ -88,7 +88,10 @@ class TelegrafExporter(): def export_result_to_telegraf(self, page_hits: VisitDict, - bot_hits: VisitDict, timestamp: int) -> None: + bot_hits: VisitDict, + user_agents: VisitDict, + methods: Counter, + timestamp: int) -> None: """ Export the bot_hits and page_hits dictionnaries to telegraf """ # export standard hits @@ -120,6 +123,32 @@ class TelegrafExporter(): except requests.exceptions.RequestException as excpt: print(excpt) sys.exit(1) + # export user agent variety + name="user_agent_variety" + for ua_type, uas in user_agents.items(): + try: + response = self.telegraf_post(timestamp, + create_time, + name, + ua_type, + uas) + response.raise_for_status() + except requests.exceptions.RequestException as excpt: + print(excpt) + sys.exit(1) + # export method variety + name="method_variety" + for method, count in methods.items(): + try: + response = self.telegraf_post(timestamp, + create_time, + name, + method, + count) + response.raise_for_status() + except requests.exceptions.RequestException as excpt: + print(excpt) + sys.exit(1) def get_crawler_patterns(exclude_crawler: bool) -> List[str]: """ Parse the crawler-user-agent file, and returns a list @@ -174,7 +203,7 @@ def parse_logfile(logfile_path: str, locations: List[str], # pylint: disable=line-too-long # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" log_line_template = (r'^(?P[0-9a-f.:]+) \- .+? \[(?P.*)\] ' - '"GET (?P{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" ' + '"(?P[A-Z]+) (?P{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" ' '"(?P.+)"$') #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, # locations)))) @@ -186,6 +215,7 @@ def parse_logfile(logfile_path: str, locations: List[str], other_visit_dict: VisitDict = defaultdict(set) bot_user_agents = set() client_user_agents = set() + method_counter = Counter() # The way to get the timezone data here is not great (not taking into account DST and such) # but it is a fallback default date that should hardly ever be used. last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt) @@ -202,12 +232,14 @@ def parse_logfile(logfile_path: str, locations: List[str], location = match_obj.group("location") last_log_date = match_obj.group("time_local") user_agent = match_obj.group("user_agent") - if location in locations: + method = match_obj.group("method") + if method == "GET" and location in locations: # For each line, if it is a GET on a known page, count it - if (not user_agent in bot_user_agents and - user_agent in client_user_agents or + if ((not user_agent in bot_user_agents and + (len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and + (user_agent in client_user_agents or not any(map(re.search, crawler_patterns, - repeat(user_agent)))): + repeat(user_agent))))): visit_dict[location].add(client_ip) client_user_agents.add(user_agent) else: @@ -217,6 +249,7 @@ def parse_logfile(logfile_path: str, locations: List[str], # Also count lines that are NOT "GET on a known page" in a different dict. # Those other hits can be static site ressources loaded, # in which case we group the hits + method_counter[method] += 1 if location.startswith("/isso/"): other_visit_dict["/isso/*"].add(client_ip) elif location.startswith("/assets/css/"): @@ -234,7 +267,8 @@ def parse_logfile(logfile_path: str, locations: List[str], microsecond=0) additional_infos = {"last_log_timestamp": int(today_date.timestamp()), "bot_user_agents_nb": len(bot_user_agents), - "client_user_agents_nb": len(client_user_agents)} + "client_user_agents_nb": len(client_user_agents), + "methods": method_counter} return visit_dict, bot_visit_dict, other_visit_dict, additional_infos @@ -280,6 +314,8 @@ def main() -> None: print_visit_dict("Bot visits", bot_visit_dict) print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)") print_visit_dict("Other visits", other_visit_dict) + for method, count in additional_infos["methods"].items(): + print(f"{method}: {count}") if telegraf_url: exporter = TelegrafExporter(telegraf_url=telegraf_url, @@ -288,6 +324,9 @@ def main() -> None: source=socket.gethostname()) exporter.export_result_to_telegraf(visit_dict, bot_visit_dict, + {"bot_user_agents":additional_infos['bot_user_agents_nb'], + "client_user_agents": additional_infos['client_user_agents_nb']}, + additional_infos["methods"], additional_infos["last_log_timestamp"]) if __name__ == "__main__":