Compare commits
2 Commits: b02f459f5b...68e25b3a84

| Author | SHA1 | Date |
|---|---|---|
|  | 68e25b3a84 |  |
|  | 723a82ecce |  |
gen_log.sh (10 changed lines)
```diff
@@ -1,9 +1,11 @@
 #!/bin/bash
+LANG=en_US.UTF-8
+now=$(date "+%d/%b/%Y:%H:%M:%S %z")
 
-for i in {1..34000}; do
-    n=$( echo "$i % 256" | bc)
-    echo "10.10.100.$n - - [10/Sep/2021:23:17:55 +0200] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
+max=$(( 200000 - "$(wc -l 'mucho_log_today.log' | cut -d ' ' -f 1)"))
+for (( i=0; i < $max; i++)); do
+    n=$(( $i % 256))
+    echo "10.10.100.$n - - [$now] \"GET /mentions-legales/ HTTP/2.0\" 301 178 \"https://saxodwarf.fr/tags/h%C3%A9bergement/\" \"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 $i\""
 if (( $i % 1000 == 0 )); then
     echo $i 1>&2
 fi
```
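The generator now stamps each line with the current time (`$now`) instead of a hardcoded September 2021 date, computes how many lines are needed to top the file up to 200,000, and sets LANG=en_US.UTF-8 so `date` emits English month abbreviations. As a quick cross-check (not part of the commit), that timestamp format round-trips through Python's `strptime`, which is what the Python log-analysis script changed below relies on. The format string here is an assumption mirroring the `date` invocation; the script's actual `time_local_fmt` value is outside this diff:

```python
from datetime import datetime

# Assumed to match the script's time_local_fmt; mirrors
# `date "+%d/%b/%Y:%H:%M:%S %z"` in gen_log.sh.
time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"

stamp = "10/Sep/2021:23:17:55 +0200"   # sample nginx $time_local value
parsed = datetime.strptime(stamp, time_local_fmt)
print(int(parsed.timestamp()))          # 1631308675
```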
```diff
@@ -13,7 +13,7 @@ import argparse
 import subprocess
 import configparser
 from datetime import datetime
-from collections import defaultdict
+from collections import defaultdict, Counter
 from itertools import repeat
 from urllib.parse import urlparse
 from typing import Dict, List, Tuple, Set
```
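`Counter` joins the imports to tally the HTTP methods seen in the log. A minimal, self-contained illustration of the counting pattern used later in `parse_logfile()` (the sample methods are invented for the demo):

```python
from collections import Counter

method_counter = Counter()
for method in ("GET", "GET", "POST", "HEAD", "GET"):
    method_counter[method] += 1   # same increment pattern as parse_logfile()

print(method_counter.most_common())   # [('GET', 3), ('POST', 1), ('HEAD', 1)]
```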
```diff
@@ -22,7 +22,8 @@ import xml.etree.ElementTree as ET
 import requests
 
 VisitDict = Dict[str, Set[str]]
+MAX_UA_NB = 1000
 
 
 def parse_args()-> argparse.Namespace:
     """ Parse arguments of the script
@@ -88,7 +88,10 @@ class TelegrafExporter():
 
 
     def export_result_to_telegraf(self, page_hits: VisitDict,
-                                  bot_hits: VisitDict, timestamp: int) -> None:
+                                  bot_hits: VisitDict,
+                                  user_agents: VisitDict,
+                                  methods: Counter,
+                                  timestamp: int) -> None:
         """ Export the bot_hits and page_hits dictionnaries to telegraf
         """
         # export standard hits
@@ -120,6 +123,32 @@ class TelegrafExporter():
             except requests.exceptions.RequestException as excpt:
                 print(excpt)
                 sys.exit(1)
+        # export user agent variety
+        name="user_agent_variety"
+        for ua_type, uas in user_agents.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              create_time,
+                                              name,
+                                              ua_type,
+                                              uas)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
+        # export method variety
+        name="method_variety"
+        for method, count in methods.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              create_time,
+                                              name,
+                                              method,
+                                              count)
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
 
 def get_crawler_patterns(exclude_crawler: bool) -> List[str]:
     """ Parse the crawler-user-agent file, and returns a list
```
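Both new export loops lean on the existing `telegraf_post()` helper, whose implementation (and the `create_time` argument) sits outside this diff. Purely for orientation, a hypothetical sketch of such a helper, assuming a Telegraf `http_listener_v2` input configured for InfluxDB line protocol; none of the names or the URL below come from the project:

```python
# Hypothetical sketch, NOT the project's actual telegraf_post() implementation.
import requests

def telegraf_post(telegraf_url: str, timestamp: int, name: str,
                  key: str, value: int) -> requests.Response:
    # InfluxDB line protocol: measurement,tag=value field=value timestamp_in_ns
    line = f"{name},type={key} value={value} {timestamp * 10**9}"
    return requests.post(telegraf_url, data=line.encode(), timeout=5)

# Example: telegraf_post("http://localhost:8080/telegraf", 1631308675,
#                        "method_variety", "GET", 12345)
```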
```diff
@@ -174,7 +203,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     # pylint: disable=line-too-long
     # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
-                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
+                         '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
     #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
     #                                                                              locations))))
```
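Replacing the literal `GET` with a named `(?P<method>[A-Z]+)` group lets the same regex match every request method and hand it to the counting logic. A standalone check (the sample line is the one gen_log.sh produces; a single escaped location stands in for the real `locations` list):

```python
import re

log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                     '"(?P<method>[A-Z]+) (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                     '"(?P<user_agent>.+)"$')
regex = re.compile(log_line_template.format(locations=re.escape('/mentions-legales/')))

sample = ('10.10.100.42 - - [10/Sep/2021:23:17:55 +0200] '
          '"GET /mentions-legales/ HTTP/2.0" 301 178 '
          '"https://saxodwarf.fr/tags/h%C3%A9bergement/" '
          '"Mozilla/5.0 (X11; Linux aarch64; rv:91.0) Gecko/20100101 Firefox/91.0 42"')

match = regex.match(sample)
assert match is not None
print(match.group("method"), match.group("location"))   # GET /mentions-legales/
```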
```diff
@@ -186,6 +215,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
     other_visit_dict: VisitDict = defaultdict(set)
     bot_user_agents = set()
     client_user_agents = set()
+    method_counter = Counter()
     # The way to get the timezone data here is not great (not taking into account DST and such)
     # but it is a fallback default date that should hardly ever be used.
     last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
@@ -202,12 +232,14 @@ def parse_logfile(logfile_path: str, locations: List[str],
             location = match_obj.group("location")
             last_log_date = match_obj.group("time_local")
             user_agent = match_obj.group("user_agent")
-            if location in locations:
+            method = match_obj.group("method")
+            if method == "GET" and location in locations:
                 # For each line, if it is a GET on a known page, count it
-                if (not user_agent in bot_user_agents and
-                    user_agent in client_user_agents or
+                if ((not user_agent in bot_user_agents and
+                     (len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB) and
+                    (user_agent in client_user_agents or
                      not any(map(re.search, crawler_patterns,
-                                 repeat(user_agent)))):
+                                 repeat(user_agent))))):
                     visit_dict[location].add(client_ip)
                     client_user_agents.add(user_agent)
                 else:
```
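The reworked condition is dense: a hit now counts as a client visit only if the user agent is not a known bot UA, the combined UA sets are still under `MAX_UA_NB` (so memory stays bounded even on hostile logs), and the UA is either already known as a client or matches no crawler pattern. The same logic restated as a standalone predicate, for illustration only (not code from the commit):

```python
import re
from typing import List, Set

MAX_UA_NB = 1000

def is_client_ua(user_agent: str, bot_user_agents: Set[str],
                 client_user_agents: Set[str],
                 crawler_patterns: List[str]) -> bool:
    # Stop learning new user agents once the budget is exhausted.
    under_budget = (len(client_user_agents) + len(bot_user_agents)) < MAX_UA_NB
    known_client = user_agent in client_user_agents
    looks_like_crawler = any(re.search(pat, user_agent) for pat in crawler_patterns)
    return ((user_agent not in bot_user_agents and under_budget) and
            (known_client or not looks_like_crawler))
```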
```diff
@@ -217,6 +249,7 @@ def parse_logfile(logfile_path: str, locations: List[str],
                 # Also count lines that are NOT "GET on a known page" in a different dict.
                 # Those other hits can be static site ressources loaded,
                 # in which case we group the hits
+                method_counter[method] += 1
                 if location.startswith("/isso/"):
                     other_visit_dict["/isso/*"].add(client_ip)
                 elif location.startswith("/assets/css/"):
@@ -234,7 +267,8 @@ def parse_logfile(logfile_path: str, locations: List[str],
                                            microsecond=0)
     additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
                         "bot_user_agents_nb": len(bot_user_agents),
-                        "client_user_agents_nb": len(client_user_agents)}
+                        "client_user_agents_nb": len(client_user_agents),
+                        "methods": method_counter}
 
 
     return visit_dict, bot_visit_dict, other_visit_dict, additional_infos
@@ -280,6 +314,8 @@ def main() -> None:
     print_visit_dict("Bot visits", bot_visit_dict)
     print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
     print_visit_dict("Other visits", other_visit_dict)
+    for method, count in additional_infos["methods"].items():
+        print(f"{method}: {count}")
 
     if telegraf_url:
         exporter = TelegrafExporter(telegraf_url=telegraf_url,
@@ -288,6 +324,9 @@ def main() -> None:
                                     source=socket.gethostname())
         exporter.export_result_to_telegraf(visit_dict,
                                            bot_visit_dict,
+                                           {"bot_user_agents":additional_infos['bot_user_agents_nb'],
+                                            "client_user_agents": additional_infos['client_user_agents_nb']},
+                                           additional_infos["methods"],
                                            additional_infos["last_log_timestamp"])
 
 if __name__ == "__main__":
```