Compare commits

...

2 Commits

Author SHA1 Message Date
Hugo 68e25b3a84 Improve dummy log generation script 2021-10-10 15:58:04 +02:00
Hugo 723a82ecce Report more stats to Telegraf, and improve dummy info generator 2021-10-09 12:29:51 +02:00
2 changed files with 54 additions and 13 deletions

View File

@@ -1,9 +1,11 @@
#!/bin/bash
LANG=en_US.UTF-8
now=$(date "+%d/%b/%Y:%H:%M:%S %z")
for i in {1..34000}; do
n=$( echo "$i % 256" | bc)
echo "10.10.100.$n - - [10/Sep/2021:23:17:55 +0200] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
max=$(( 200000 - "$(wc -l 'mucho_log_today.log' | cut -d ' ' -f 1)"))
for (( i=0; i < $max; i++)); do
n=$(( $i % 256))
echo "10.10.100.$n - - [$now] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
if (( $i % 1000 == 0 )); then
echo $i 1>&2
fi

View File

@@ -13,7 +13,7 @@ import argparse
import subprocess
import configparser
from datetime import datetime
from collections import defaultdict
from collections import defaultdict, Counter
from itertools import repeat
from urllib.parse import urlparse
from typing import Dict, List, Tuple, Set
@@ -22,7 +22,7 @@ import xml.etree.ElementTree as ET
import requests
VisitDict = Dict[str, Set[str]]
MAX_UA_NB = 1000
def parse_args()-> argparse.Namespace:
""" Parse arguments of the script
@@ -88,7 +88,10 @@ class TelegrafExporter():
def export_result_to_telegraf(self, page_hits: VisitDict,
bot_hits: VisitDict, timestamp: int) -> None:
bot_hits: VisitDict,
user_agents: VisitDict,
methods: Counter,
timestamp: int) -> None:
""" Export the bot_hits and page_hits dictionnaries to telegraf
"""
# export standard hits
@@ -120,6 +123,32 @@ class TelegrafExporter():
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export user agent variety
name="user_agent_variety"
for ua_type, uas in user_agents.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
ua_type,
uas)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export method variety
name="method_variety"
for method, count in methods.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
method,
count)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
def get_crawler_patterns(exclude_crawler: bool) -> List[str]:
""" Parse the crawler-user-agent file, and returns a list
@@ -174,7 +203,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
# pylint: disable=line-too-long
# $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
'"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
'"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
'"(?P<user_agent>.+)"$')
#known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
# locations))))
@@ -186,6 +215,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
other_visit_dict: VisitDict = defaultdict(set)
bot_user_agents = set()
client_user_agents = set()
method_counter = Counter()
# The way to get the timezone data here is not great (not taking into account DST and such)
# but it is a fallback default date that should hardly ever be used.
last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -202,12 +232,14 @@ def parse_logfile(logfile_path: str, locations: List[str],
location = match_obj.group("location")
last_log_date = match_obj.group("time_local")
user_agent = match_obj.group("user_agent")
if location in locations:
method = match_obj.group("method")
if method == "GET" and location in locations:
# For each line, if it is a GET on a known page, count it
if (not user_agent in bot_user_agents and
user_agent in client_user_agents or
if ((not user_agent in bot_user_agents and
(len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and
(user_agent in client_user_agents or
not any(map(re.search, crawler_patterns,
repeat(user_agent)))):
repeat(user_agent))))):
visit_dict[location].add(client_ip)
client_user_agents.add(user_agent)
else:
@@ -217,6 +249,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
# Also count lines that are NOT "GET on a known page" in a different dict.
# Those other hits can be static site ressources loaded,
# in which case we group the hits
method_counter[method] += 1
if location.startswith("/isso/"):
other_visit_dict["/isso/*"].add(client_ip)
elif location.startswith("/assets/css/"):
@@ -234,7 +267,8 @@ def parse_logfile(logfile_path: str, locations: List[str],
microsecond=0)
additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
"bot_user_agents_nb": len(bot_user_agents),
"client_user_agents_nb": len(client_user_agents)}
"client_user_agents_nb": len(client_user_agents),
"methods": method_counter}
return visit_dict, bot_visit_dict, other_visit_dict, additional_infos
@@ -280,6 +314,8 @@ def main() -> None:
print_visit_dict("Bot visits", bot_visit_dict)
print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
print_visit_dict("Other visits", other_visit_dict)
for method, count in additional_infos["methods"].items():
print(f"{method}: {count}")
if telegraf_url:
exporter = TelegrafExporter(telegraf_url=telegraf_url,
@@ -288,6 +324,9 @@ def main() -> None:
source=socket.gethostname())
exporter.export_result_to_telegraf(visit_dict,
bot_visit_dict,
{"bot_user_agents":additional_infos['bot_user_agents_nb'],
"client_user_agents": additional_infos['client_user_agents_nb']},
additional_infos["methods"],
additional_infos["last_log_timestamp"])
if __name__ == "__main__":