Compare commits

...

11 Commits

2 changed files with 134 additions and 33 deletions

gen_log.sh Executable file

@@ -0,0 +1,12 @@
#!/bin/bash
LANG=en_US.UTF-8
now=$(date "+%d/%b/%Y:%H:%M:%S %z")
max=$(( 200000 - "$(wc -l 'mucho_log_today.log' | cut -d ' ' -f 1)"))
for (( i=0; i < $max; i++)); do
n=$(( $i % 256))
echo "10.10.100.$n - - [$now] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
if (( $i % 1000 == 0 )); then
echo $i 1>&2
fi
done
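
As a quick sanity check (not part of the commits themselves), the sketch below feeds one line in the format gen_log.sh emits to the NGINX log regex that the updated parser builds further down; the date and counter values are made up.

import re

# Same template as in parse_logfile() below, formatted with the catch-all location.
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                     '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                     '"(?P<user_agent>.+)"$')
log_regex = re.compile(log_line_template.format(locations='.+?'))

# One line as gen_log.sh would emit it (illustrative date and counter).
sample = ('10.10.100.7 - - [06/Oct/2023:10:00:00 +0200] '
          '"GET /mentions-legales/ HTTP/2.0" 301 178 '
          '"https://saxodwarf.fr/tags/h%C3%A9bergement/" '
          '"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 7"')

match = log_regex.match(sample)
assert match is not None
print(match.group("method"), match.group("location"))  # GET /mentions-legales/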


@@ -12,8 +12,8 @@ import getpass
import argparse
import subprocess
import configparser
from datetime import datetime, time
from collections import defaultdict
from datetime import datetime
from collections import defaultdict, Counter
from itertools import repeat
from urllib.parse import urlparse
from typing import Dict, List, Tuple, Set
@@ -22,7 +22,7 @@ import xml.etree.ElementTree as ET
import requests
VisitDict = Dict[str, Set[str]]
MAX_UA_NB = 1000
def parse_args()-> argparse.Namespace:
""" Parse arguments of the script
@@ -66,28 +66,43 @@ class TelegrafExporter():
self._password = password
self.source = source
def telegraf_post(self, timestamp:int, title:str, location:str, count:int)-> requests.Response:
def telegraf_post(self, timestamp:int, create_time: int, title:str,
metric:str, count:int)-> requests.Response:
""" Post a value to telegraf
:param timestamp: timestamp used by influxdb as time field.
:param create_time: number of seconds since midnight at which the data point is exported
(to de-duplicate entries generated on the same day).
:param title: name of the destination table in influxdb
:param metric: path or metric name for which we register the count, used as a tag in influxdb.
:param count: hit count for the aforementioned path
"""
payload = {"name": title,
"timestamp": timestamp,
"create_time": create_time,
"source": self.source,
"location": location,
"location": metric,
"hits": count}
return requests.post(self.telegraf_url,
json=payload,
auth=(self.username, self._password))
def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
def export_result_to_telegraf(self, page_hits: VisitDict,
bot_hits: VisitDict,
user_agents: Dict[str, int],
methods: Counter[str],
timestamp: int) -> None:
""" Export the page_hits and bot_hits dictionaries, plus the user-agent and HTTP method counts, to telegraf
"""
# export standard hits
timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
now = datetime.now().time()
create_time = now.second + 60*now.minute + 3600*now.hour
name="blog_client_hit"
for location, ips in page_hits.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
location,
len(ips))
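For orientation (illustrative only, assuming the configured Telegraf URL is an HTTP listener accepting flat JSON bodies), a single call such as telegraf_post(timestamp, create_time, "blog_client_hit", "/mentions-legales/", 3) ends up posting a body along these lines; the host, path and counts are made up.

from datetime import datetime

now = datetime(2023, 10, 6, 14, 30, 5).time()               # example export time
create_time = now.second + 60*now.minute + 3600*now.hour    # 52205 seconds into the day
payload = {"name": "blog_client_hit",                       # destination table in influxdb
           "timestamp": 1696543200,                         # midnight of the day the log covers
           "create_time": create_time,                      # de-duplicates same-day re-runs
           "source": "example-host",                        # socket.gethostname() in the script
           "location": "/mentions-legales/",
           "hits": 3}
print(payload)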
@@ -100,6 +115,7 @@ class TelegrafExporter():
for location, ips in bot_hits.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
location,
len(ips))
@@ -107,8 +123,34 @@ class TelegrafExporter():
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export user agent variety
name="user_agent_variety"
for metric_name, count in user_agents.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
metric_name,
count)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
# export method variety
name="method_variety"
for metric_name, count in methods.items():
try:
response = self.telegraf_post(timestamp,
create_time,
name,
metric_name,
count)
response.raise_for_status()
except requests.exceptions.RequestException as excpt:
print(excpt)
sys.exit(1)
def get_crawler_patterns(exclude_crawler: bool) -> List[str]:
def get_crawler_patterns(exclude_crawler: bool) -> List[re.Pattern[str]]:
""" Parse the crawler-user-agent file and return a list
of compiled crawler regex patterns
"""
@@ -150,39 +192,65 @@ def get_locations(sitemap_path:str) -> List[str]:
def parse_logfile(logfile_path: str, locations: List[str],
crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]:
""" Parse a logfile, and return 3 dicts:
page_hits, bot_hits and other_hits
crawler_patterns: List[re.Pattern[str]]) -> Tuple[VisitDict, VisitDict,
VisitDict, Dict[str, int],
Counter[str]]:
""" Parse a logfile, and return the page_hits, bot_hits and other_hits visit dicts,
an additional_infos dict, and a Counter of the HTTP methods seen on the other hits
"""
time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
# Regexes for all the pattern matching
log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
locations))))
other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
# Default format for NGINX log is:
# pylint: disable=line-too-long
# $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
'"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
'"(?P<user_agent>.+)"$')
#known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
# locations))))
log_regex = re.compile(log_line_template.format(locations='.+?'))
# Output data structure initialization
visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
other_visit_dict: VisitDict = defaultdict(set)
bot_user_agents: Set[str] = set()
client_user_agents: Set[str] = set()
method_counter: Counter[str] = Counter()
# Getting the timezone this way is not great (it does not account for DST and such),
# but it only provides a fallback default date that should hardly ever be used.
last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
# Do not parse a log file that has not been modified in the last 24 hours
if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600:
print("Log file is too old, there was no access today.")
logfile_path="/dev/null"
with open(logfile_path, 'r', encoding='utf-8') as logfile:
for line in logfile:
match_obj = re.match(known_page_regex, line)
match_obj = re.match(log_regex, line)
if match_obj:
# For each line, check if it is a GET on a known page, and count those
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
if not any(map(re.search, crawler_patterns, repeat(line))):
visit_dict[location].add(client_ip)
last_log_date = match_obj.group("time_local")
user_agent = match_obj.group("user_agent")
method = match_obj.group("method")
if method == "GET" and location in locations:
# For each line, if it is a GET on a known page, count it
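# Each user agent is classified once and cached in client_user_agents /
# bot_user_agents, so the crawler regexes run at most once per user agent;
# once MAX_UA_NB distinct user agents have been cached, remaining hits are
# counted as bot visits.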
if ((user_agent not in bot_user_agents and
(len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and
(user_agent in client_user_agents or
not any(map(re.search, crawler_patterns,
repeat(user_agent))))):
visit_dict[location].add(client_ip)
client_user_agents.add(user_agent)
else:
bot_visit_dict[location].add(client_ip)
bot_user_agents.add(user_agent)
else:
bot_visit_dict[location].add(client_ip)
else:
# Also count lines that are NOT GET on a known page in a different dict.
match_obj = re.match(other_pages_regex, line)
if match_obj:
client_ip = match_obj.group("ip_address")
location = match_obj.group("location")
# Those other hits are either resources being loaded, in which case we group the hits
# Also count lines that are NOT "GET on a known page" in a different dict.
# Those other hits can be static site resources being loaded,
# in which case we group the hits
method_counter[method] += 1
if location.startswith("/isso/"):
other_visit_dict["/isso/*"].add(client_ip)
elif location.startswith("/assets/css/"):
@@ -192,9 +260,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
elif location.startswith("/images/"):
other_visit_dict["/images/*"].add(client_ip)
else:
# for everything else, we store the exact path
# for everything else, we store the exact path, but not the query string
other_visit_dict[location.split('?')[0]].add(client_ip)
return visit_dict, bot_visit_dict, other_visit_dict
today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
minute=0,
second=0,
microsecond=0)
additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
"bot_user_agents_nb": len(bot_user_agents),
"client_user_agents_nb": len(client_user_agents)}
return visit_dict, bot_visit_dict, other_visit_dict, additional_infos, method_counter
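As a rough illustration (not taken from the patch), the five return values could look like this after a log containing one non-bot GET on a known page and one POST under /isso/; all values are made up.

from collections import Counter, defaultdict

visit_dict = {"/mentions-legales/": {"10.10.100.7"}}                # client IPs per known page
bot_visit_dict = {"/mentions-legales/": set()}                      # bot IPs per known page
other_visit_dict = defaultdict(set, {"/isso/*": {"10.10.100.8"}})   # grouped "other" hits
additional_infos = {"last_log_timestamp": 1696543200,               # midnight of the last log line's day
                    "bot_user_agents_nb": 0,
                    "client_user_agents_nb": 1}
method_counter = Counter({"POST": 1})                                # methods of the "other" hits only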
def main() -> None:
@@ -226,22 +303,34 @@ def main() -> None:
# Get parser, get locations and parse the log file
crawler_patterns = get_crawler_patterns(args.exclude_crawler)
locations = get_locations(args.sitemap)
visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
locations,
crawler_patterns)
(visit_dict, bot_visit_dict, other_visit_dict,
additional_infos, method_counter) = parse_logfile(args.logfile,
locations,
crawler_patterns)
# Generate the report
print_visit_dict("Standard visits", visit_dict)
print(f"There were {additional_infos['client_user_agents_nb']} unique client user agent(s)")
if args.exclude_crawler:
print_visit_dict("Bot visits", bot_visit_dict)
print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
print_visit_dict("Other visits", other_visit_dict)
for method, count in method_counter.items():
print(f"{method}: {count}")
if telegraf_url:
exporter = TelegrafExporter(telegraf_url=telegraf_url,
username=username,
password=_password,
source=socket.gethostname())
exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)
exporter.export_result_to_telegraf(visit_dict,
bot_visit_dict,
{"bot_user_agents":
additional_infos['bot_user_agents_nb'],
"client_user_agents":
additional_infos['client_user_agents_nb']},
method_counter,
additional_infos["last_log_timestamp"])
if __name__ == "__main__":
main()