Compare commits
2 Commits: b02f459f5b...68e25b3a84

| Author | SHA1 | Date |
|---|---|---|
|  | 68e25b3a84 |  |
|  | 723a82ecce |  |
gen_log.sh (10 changed lines)
```diff
@@ -1,9 +1,11 @@
 #!/bin/bash
+LANG=en_US.UTF-8
+now=$(date "+%d/%b/%Y:%H:%M:%S %z")
 
-for i in {1..34000}; do
-    n=$( echo "$i % 256" | bc)
-    echo "10.10.100.$n - - [10/Sep/2021:23:17:55 +0200] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
+max=$(( 200000 - "$(wc -l 'mucho_log_today.log' | cut -d ' ' -f 1)"))
+for (( i=0; i < $max; i++)); do
+    n=$(( $i % 256))
+    echo "10.10.100.$n - - [$now] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
 if (( $i % 1000 == 0 )); then
     echo $i 1>&2
 fi
```
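The generator now stamps each line with the current time (`$now`) instead of a hardcoded September 2021 date, computes how many lines are needed to top the file up to 200,000, and sets LANG=en_US.UTF-8 so `date` emits English month abbreviations. As a quick cross-check (not part of the commit), that timestamp format round-trips through Python's `strptime`, which is what the Python log-analysis script changed below relies on. The format string here is an assumption mirroring the `date` invocation; the script's actual `time_local_fmt` value is outside this diff:

```python
from datetime import datetime

# Assumed to match the script's time_local_fmt; mirrors
# `date "+%d/%b/%Y:%H:%M:%S %z"` in gen_log.sh.
time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"

stamp = "10/Sep/2021:23:17:55 +0200"   # sample nginx $time_local value
parsed = datetime.strptime(stamp, time_local_fmt)
print(int(parsed.timestamp()))          # 1631308675
```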
```diff
@@ -13,7 +13,7 @@ import argparse
 import subprocess
 import configparser
 from datetime import datetime
-from collections import defaultdict
+from collections import defaultdict, Counter
 from itertools import repeat
 from urllib.parse import urlparse
 from typing import Dict, List, Tuple, Set
```
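`Counter` joins the imports to tally the HTTP methods seen in the log. A minimal, self-contained illustration of the counting pattern used later in `parse_logfile()` (the sample methods are invented for the demo):

```python
from collections import Counter

method_counter = Counter()
for method in ("GET", "GET", "POST", "HEAD", "GET"):
    method_counter[method] += 1   # same increment pattern as parse_logfile()

print(method_counter.most_common())   # [('GET', 3), ('POST', 1), ('HEAD', 1)]
```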
```diff
@@ -22,7 +22,8 @@ import xml.etree.ElementTree as ET
 import requests
 
 VisitDict = Dict[str, Set[str]]
+MAX_UA_NB = 1000
 
 
 def parse_args()-> argparse.Namespace:
     """ Parse arguments of the script
@@ -88,7 +88,10 @@ class TelegrafExporter():
 
 
     def export_result_to_telegraf(self, page_hits: VisitDict,
-                                  bot_hits: VisitDict, timestamp: int) -> None:
+                                  bot_hits: VisitDict,
+                                  user_agents: VisitDict,
+                                  methods: Counter,
+                                  timestamp: int) -> None:
         """ Export the bot_hits and page_hits dictionnaries to telegraf
         """
         # export standard hits
@@ -120,6 +123,32 @@ class TelegrafExporter():
             except requests.exceptions.RequestException as excpt:
                 print(excpt)
                 sys.exit(1)
+        # export user agent variety
+        name="user_agent_variety"
+        for ua_type, uas in user_agents.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              create_time,
+                                              name,
+                                              ua_type,
+                                              uas)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
+        # export method variety
+        name="method_variety"
+        for method, count in methods.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              create_time,
+                                              name,
+                                              method,
+                                              count)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
 
 def get_crawler_patterns(exclude_crawler: bool) -> List[str]:
     """ Parse the crawler-user-agent file, and returns a list
```
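Both new export loops lean on the existing `telegraf_post()` helper, whose implementation (and the `create_time` argument) sits outside this diff. Purely for orientation, a hypothetical sketch of such a helper, assuming a Telegraf `http_listener_v2` input configured for InfluxDB line protocol; none of the names or the URL below come from the project:

```python
# Hypothetical sketch, NOT the project's actual telegraf_post() implementation.
import requests

def telegraf_post(telegraf_url: str, timestamp: int, name: str,
                  key: str, value: int) -> requests.Response:
    # InfluxDB line protocol: measurement,tag=value field=value timestamp_in_ns
    line = f"{name},type={key} value={value} {timestamp * 10**9}"
    return requests.post(telegraf_url, data=line.encode(), timeout=5)

# Example: telegraf_post("http://localhost:8080/telegraf", 1631308675,
#                        "method_variety", "GET", 12345)
```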
```diff
@@ -174,7 +203,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     # pylint: disable=line-too-long
     # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
-                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
+                         '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
     #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
     #                                                                              locations))))
```
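Replacing the literal `GET` with a named `(?P<method>[A-Z]+)` group lets the same regex match every request method and hand it to the counting logic. A standalone check (the sample line is the one gen_log.sh produces; a single escaped location stands in for the real `locations` list):

```python
import re

log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                     '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                     '"(?P<user_agent>.+)"$')
regex = re.compile(log_line_template.format(locations=re.escape('/mentions-legales/')))

sample = ('10.10.100.42 - - [10/Sep/2021:23:17:55 +0200] '
          '"GET /mentions-legales/ HTTP/2.0" 301 178 '
          '"https://saxodwarf.fr/tags/h%C3%A9bergement/" '
          '"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 42"')

match = regex.match(sample)
assert match is not None
print(match.group("method"), match.group("location"))   # GET /mentions-legales/
```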
```diff
@@ -186,6 +215,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     other_visit_dict: VisitDict = defaultdict(set)
     bot_user_agents = set()
     client_user_agents = set()
+    method_counter = Counter()
     # The way to get the timezone data here is not great (not taking into account DST and such)
     # but it is a fallback default date that should hardly ever be used.
     last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -202,12 +232,14 @@ def parse_logfile(logfile_path: str, locations: List[str],
             location = match_obj.group("location")
             last_log_date = match_obj.group("time_local")
             user_agent = match_obj.group("user_agent")
-            if location in locations:
+            method = match_obj.group("method")
+            if method == "GET" and location in locations:
                 # For each line, if it is a GET on a known page, count it
-                if (not user_agent in bot_user_agents and
-                    user_agent in client_user_agents or
+                if ((not user_agent in bot_user_agents and
+                     (len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and
+                    (user_agent in client_user_agents or
                      not any(map(re.search, crawler_patterns,
-                                 repeat(user_agent)))):
+                                 repeat(user_agent))))):
                     visit_dict[location].add(client_ip)
                     client_user_agents.add(user_agent)
                 else:
```
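The reworked condition is dense: a hit now counts as a client visit only if the user agent is not a known bot UA, the combined UA sets are still under `MAX_UA_NB` (so memory stays bounded even on hostile logs), and the UA is either already known as a client or matches no crawler pattern. The same logic restated as a standalone predicate, for illustration only (not code from the commit):

```python
import re
from typing import List, Set

MAX_UA_NB = 1000

def is_client_ua(user_agent: str, bot_user_agents: Set[str],
                 client_user_agents: Set[str],
                 crawler_patterns: List[str]) -> bool:
    # Stop learning new user agents once the budget is exhausted.
    under_budget = (len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB
    known_client = user_agent in client_user_agents
    looks_like_crawler = any(re.search(pat, user_agent) for pat in crawler_patterns)
    return ((user_agent not in bot_user_agents and under_budget) and
            (known_client or not looks_like_crawler))
```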
```diff
@@ -217,6 +249,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
                 # Also count lines that are NOT "GET on a known page" in a different dict.
                 # Those other hits can be static site ressources loaded,
                 # in which case we group the hits
+                method_counter[method] += 1
                 if location.startswith("/isso/"):
                     other_visit_dict["/isso/*"].add(client_ip)
                 elif location.startswith("/assets/css/"):
@@ -234,7 +267,8 @@ def parse_logfile(logfile_path: str, locations: List[str],
                                            microsecond=0)
     additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
                         "bot_user_agents_nb": len(bot_user_agents),
-                        "client_user_agents_nb": len(client_user_agents)}
+                        "client_user_agents_nb": len(client_user_agents),
+                        "methods": method_counter}
 
 
     return visit_dict, bot_visit_dict, other_visit_dict, additional_infos
@@ -280,6 +314,8 @@ def main() -> None:
     print_visit_dict("Bot visits", bot_visit_dict)
     print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
     print_visit_dict("Other visits", other_visit_dict)
+    for method, count in additional_infos["methods"].items():
+        print(f"{method}: {count}")
 
     if telegraf_url:
         exporter = TelegrafExporter(telegraf_url=telegraf_url,
@@ -288,6 +324,9 @@ def main() -> None:
                                     source=socket.gethostname())
         exporter.export_result_to_telegraf(visit_dict,
                                            bot_visit_dict,
+                                           {"bot_user_agents":additional_infos['bot_user_agents_nb'],
+                                            "client_user_agents": additional_infos['client_user_agents_nb']},
+                                           additional_infos["methods"],
                                            additional_infos["last_log_timestamp"])
 
 if __name__ == "__main__":
```