Compare commits

...

11 Commits

2 changed files with 134 additions and 33 deletions

gen_log.sh Executable file

@@ -0,0 +1,12 @@
#!/bin/bash
LANG=en_US.UTF-8
now=$(date "+%d/%b/%Y:%H:%M:%S %z")
max=$(( 200000 - "$(wc -l 'mucho_log_today.log' | cut -d ' ' -f 1)"))
for (( i=0; i < $max; i++)); do
n=$(( $i % 256))
echo "10.10.100.$n - - [$now] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
if (( $i % 1000 == 0 )); then
echo $i 1>&2
fi
done
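
As a quick sanity check (not part of the commits themselves), the sketch below feeds one line in the format gen_log.sh emits to the NGINX log regex that the updated parser builds further down; the date and counter values are made up.

import re

# Same template as in parse_logfile() below, formatted with the catch-all location.
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                     '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                     '"(?P<user_agent>.+)"$')
log_regex = re.compile(log_line_template.format(locations='.+?'))

# One line as gen_log.sh would emit it (illustrative date and counter).
sample = ('10.10.100.7 - - [06/Oct/2023:10:00:00 +0200] '
          '"GET /mentions-legales/ HTTP/2.0" 301 178 '
          '"https://saxodwarf.fr/tags/h%C3%A9bergement/" '
          '"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 7"')

match = log_regex.match(sample)
assert match is not None
print(match.group("method"), match.group("location"))  # GET /mentions-legales/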


@@ -12,8 +12,8 @@ import getpass
import argparse
import subprocess
import configparser
from datetime import datetime, time
from collections import defaultdict
from datetime import datetime
from collections import defaultdict, Counter
from itertools import repeat
from urllib.parse import urlparse
from typing import Dict, List, Tuple, Set
@@ -22,7 +22,7 @@ import xml.etree.ElementTree as ET
import requests
VisitDict = Dict[str, Set[str]]
MAX_UA_NB = 1000
def parse_args()-> argparse.Namespace:
""" Parse arguments of the script
@@ -66,28 +66,43 @@ class TelegrafExporter():
self._password = password
self.source = source
def telegraf_post(self, timestamp:int, title:str, location:str, count:int)-> requests.Response:
def telegraf_post(self, timestamp:int, create_time: int, title:str,
metric:str, count:int)-> requests.Response:
""" Post a value to telegraf
:param timestamp: timestamp used by influxdb as time field.
:param create_time: number of seconds since midnight at which the data point is exported
(to de-duplicate entries generated on the same day).
:param title: name of the destination table in influxdb
:param metric: path or metric name for which we register the count, used as a tag in influxdb.
:param count: hit count for the aforementioned path
"""
payload = {"name": title,
"timestamp": timestamp,
"create_time": create_time,
"source": self.source,
"location": location,
"location": metric,
"hits": count}
return requests.post(self.telegraf_url,
json=payload,
auth=(self.username, self._password))
def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
def export_result_to_telegraf(self, page_hits: VisitDict,
bot_hits: VisitDict,
user_agents: Dict[str, int],
methods: Counter[str],
timestamp: int) -> None:
""" Export the page_hits and bot_hits dictionaries, plus the user-agent and HTTP method counts, to telegraf
"""
# export standard hits
timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
now = datetime.now().time()
create_time = now.second + 60*now.minute + 3600*now.hour
name="blog_client_hit"
for location, ips in page_hits.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
location,
len(ips))
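For orientation (illustrative only, assuming the configured Telegraf URL is an HTTP listener accepting flat JSON bodies), a single call such as telegraf_post(timestamp, create_time, "blog_client_hit", "/mentions-legales/", 3) ends up posting a body along these lines; the host, path and counts are made up.

from datetime import datetime

now = datetime(2023, 10, 6, 14, 30, 5).time()               # example export time
create_time = now.second + 60*now.minute + 3600*now.hour    # 52205 seconds into the day
payload = {"name": "blog_client_hit",                       # destination table in influxdb
           "timestamp": 1696543200,                         # midnight of the day the log covers
           "create_time": create_time,                      # de-duplicates same-day re-runs
           "source": "example-host",                        # socket.gethostname() in the script
           "location": "/mentions-legales/",
           "hits": 3}
print(payload)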
@@ -100,6 +115,7 @@ class TelegrafExporter():
for location, ips in bot_hits.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
location,
len(ips))
@@ -107,8 +123,34 @@ class TelegrafExporter():
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export user agent variety
name="user_agent_variety"
for metric_name, count in user_agents.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
metric_name,
count)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export method variety
name="method_variety"
for metric_name, count in methods.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
metric_name,
count)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
def get_crawler_patterns(exclude_crawler: bool) -> List[str]:
def get_crawler_patterns(exclude_crawler: bool) -> List[re.Pattern[str]]:
""" Parse the crawler-user-agent file and return a list
of compiled crawler regex patterns
"""
@@ -150,39 +192,65 @@ def get_locations(sitemap_path:str) -> List[str]:
def parse_logfile(logfile_path: str, locations: List[str],
crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]:
""" Parse a logfile, and return 3 dicts:
page_hits, bot_hits and other_hits
crawler_patterns: List[re.Pattern[str]]) -> Tuple[VisitDict, VisitDict,
VisitDict, Dict[str, int],
Counter[str]]:
""" Parse a logfile, and return the page_hits, bot_hits and other_hits visit dicts,
an additional_infos dict, and a Counter of the HTTP methods seen on the other hits
"""
time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
# Regexes for all the pattern matching
log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
locations))))
other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
# Default format for NGINX log is:
# pylint: disable=line-too-long
# $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
'"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
'"(?P<user_agent>.+)"$')
#known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
# locations))))
log_regex = re.compile(log_line_template.format(locations='.+?'))
# Output data structure initialization
visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
other_visit_dict: VisitDict = defaultdict(set)
bot_user_agents: Set[str] = set()
client_user_agents: Set[str] = set()
method_counter: Counter[str] = Counter()
# Getting the timezone this way is not great (it does not account for DST and such),
# but it only provides a fallback default date that should hardly ever be used.
last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
# Do not parse a log file that has not been modified in the last 24 hours
if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600:
print("Log file is too old, there was no access today.")
logfile_path="/dev/null"
with open(logfile_path, 'r', encoding='utf-8') as logfile:
for line in logfile:
match_obj = re.match(known_page_regex, line)
match_obj = re.match(log_regex, line)
if match_obj:
# For each line, check if it is a GET on a known page, and count those
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
if not any(map(re.search, crawler_patterns, repeat(line))):
visit_dict[location].add(client_ip)
last_log_date = match_obj.group("time_local")
user_agent = match_obj.group("user_agent")
method = match_obj.group("method")
if method == "GET" and location in locations:
# For each line, if it is a GET on a known page, count it
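# Each user agent is classified once and cached in client_user_agents /
# bot_user_agents, so the crawler regexes run at most once per user agent;
# once MAX_UA_NB distinct user agents have been cached, remaining hits are
# counted as bot visits.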
if ((user_agent not in bot_user_agents and
(len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and
(user_agent in client_user_agents or
not any(map(re.search, crawler_patterns,
repeat(user_agent))))):
visit_dict[location].add(client_ip)
client_user_agents.add(user_agent)
else:
bot_visit_dict[location].add(client_ip)
bot_user_agents.add(user_agent)
else:
bot_visit_dict[location].add(client_ip)
else:
# Also count lines that are NOT GET on a known page in a different dict.
match_obj = re.match(other_pages_regex, line)
if match_obj:
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
# Those other hits are either resources being loaded, in which case we group the hits
# Also count lines that are NOT "GET on a known page" in a different dict.
# Those other hits can be static site resources being loaded,
# in which case we group the hits
method_counter[method] += 1
if location.startswith("/isso/"):
other_visit_dict["/isso/*"].add(client_ip)
elif location.startswith("/assets/css/"):
@@ -192,9 +260,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
elif location.startswith("/images/"):
other_visit_dict["/images/*"].add(client_ip)
else:
# for everything else, we store the exact path
# for everything else, we store the exact path, but not the query string
other_visit_dict[location.split('?')[0]].add(client_ip)
return visit_dict, bot_visit_dict, other_visit_dict
today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
minute=0,
second=0,
microsecond=0)
additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
"bot_user_agents_nb": len(bot_user_agents),
"client_user_agents_nb": len(client_user_agents)}
return visit_dict, bot_visit_dict, other_visit_dict, additional_infos, method_counter
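As a rough illustration (not taken from the patch), the five return values could look like this after a log containing one non-bot GET on a known page and one POST under /isso/; all values are made up.

from collections import Counter, defaultdict

visit_dict = {"/mentions-legales/": {"10.10.100.7"}}                # client IPs per known page
bot_visit_dict = {"/mentions-legales/": set()}                      # bot IPs per known page
other_visit_dict = defaultdict(set, {"/isso/*": {"10.10.100.8"}})   # grouped "other" hits
additional_infos = {"last_log_timestamp": 1696543200,               # midnight of the last log line's day
                    "bot_user_agents_nb": 0,
                    "client_user_agents_nb": 1}
method_counter = Counter({"POST": 1})                                # methods of the "other" hits only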
def main() -> None:
@@ -226,22 +303,34 @@ def main() -> None:
# Get parser, get locations and parse the log file
crawler_patterns = get_crawler_patterns(args.exclude_crawler)
locations = get_locations(args.sitemap)
visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
locations,
crawler_patterns)
(visit_dict, bot_visit_dict, other_visit_dict,
additional_infos, method_counter) = parse_logfile(args.logfile,
locations,
crawler_patterns)
# Generate the report
print_visit_dict("Standard visits", visit_dict)
print(f"There were {additional_infos['client_user_agents_nb']} unique client user agent(s)")
if args.exclude_crawler:
print_visit_dict("Bot visits", bot_visit_dict)
print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
print_visit_dict("Other visits", other_visit_dict)
for method, count in method_counter.items():
print(f"{method}: {count}")
if telegraf_url:
exporter = TelegrafExporter(telegraf_url=telegraf_url,
username=username,
password=_password,
source=socket.gethostname())
exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)
exporter.export_result_to_telegraf(visit_dict,
bot_visit_dict,
{"bot_user_agents":
additional_infos['bot_user_agents_nb'],
"client_user_agents":
additional_infos['client_user_agents_nb']},
method_counter,
additional_infos["last_log_timestamp"])
if __name__ == "__main__":
main()