Compare commits

...

4 Commits

Author SHA1 Message Date
Hugo 94519bfd5e Add number of unique client user agent display 2021-09-23 18:10:04 +02:00
Hugo 750a72a477 check already existing UA before checking regex 2021-09-23 18:03:05 +02:00
Hugo 508b838bbc reorganize the searching function 2021-09-22 22:04:09 +02:00
Hugo 6ce2cea3a1 Add some timestamp-related improvement 2021-09-22 21:48:06 +02:00
1 changed file with 67 additions and 28 deletions


@@ -12,7 +12,7 @@ import getpass
 import argparse
 import subprocess
 import configparser
-from datetime import datetime, time
+from datetime import datetime
 from collections import defaultdict
 from itertools import repeat
 from urllib.parse import urlparse
@@ -69,6 +69,12 @@ class TelegrafExporter():
     def telegraf_post(self, timestamp:int, create_time: int, title:str,
                       location:str, count:int)-> requests.Response:
         """ Post a value to telegraf
+        :param timestamp: timestamp used by influxdb as time field.
+        :param create_time: second of the day at which the data point is exported
+                            (to de-duplicate entries generated on the same day).
+        :param title: name of the destination table in influxdb
+        :param location: path for which we register the hit count, used as a tag in influxdb.
+        :param count: hit count for the aforementioned path
         """
         payload = {"name": title,
                    "timestamp": timestamp,
@@ -81,13 +87,14 @@ class TelegrafExporter():
                                  auth=(self.username, self._password))

-    def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
+    def export_result_to_telegraf(self, page_hits: VisitDict,
+                                  bot_hits: VisitDict, timestamp: int) -> None:
         """ Export the bot_hits and page_hits dictionnaries to telegraf
         """
         # export standard hits
-        timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
         now = datetime.now().time()
         create_time = now.second + 60*now.minute + 3600*now.hour
         name="blog_client_hit"
         for location, ips in page_hits.items():
             try:
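As the new docstring above notes, create_time is meant to de-duplicate points exported on the same day: it is simply the number of seconds elapsed since midnight at export time. A minimal worked example (the wall-clock time is a made-up sample):

    from datetime import datetime

    now = datetime(2021, 9, 23, 18, 10, 4).time()  # sample time, for illustration only
    create_time = now.second + 60*now.minute + 3600*now.hour
    print(create_time)  # 65404 -> 18:10:04 is 65404 seconds after midnight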
@@ -156,41 +163,60 @@ def get_locations(sitemap_path:str) -> List[str]:
 def parse_logfile(logfile_path: str, locations: List[str],
-                  crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]:
-    """ Parse a logfile, and return 3 dicts:
-    page_hits, bot_hits and other_hits
+                  crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict,
+                                                        VisitDict, Dict[str, int]]:
+    """ Parse a logfile, and return 4 dicts:
+    page_hits, bot_hits, other_hits and additional_infos
     """
+    time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
     # Regexes for all the pattern matching
-    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-                                                                                  locations))))
-    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    # Default format for NGINX log is:
+    # pylint: disable=line-too-long
+    # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
+    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
+                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
+                         '"(?P<user_agent>.+)"$')
+    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+    #                                                                              locations))))
+    log_regex = re.compile(log_line_template.format(locations='.+?'))
     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     other_visit_dict: VisitDict = defaultdict(set)
+    bot_user_agents = set()
+    client_user_agents = set()
+    # The way to get the timezone data here is not great (not taking into account DST and such)
+    # but it is a fallback default date that should hardly ever be used.
+    last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
+    # Do not parse a log file that has not been edited since more than 24 hours
     if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600:
         print("Log file is too old, there was no access today.")
         logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(known_page_regex, line)
+            match_obj = re.match(log_regex, line)
             if match_obj:
-                # For each line, check if it is a GET on a lnown page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
-                if not any(map(re.search, crawler_patterns, repeat(line))):
-                    visit_dict[location].add(client_ip)
+                last_log_date = match_obj.group("time_local")
+                user_agent = match_obj.group("user_agent")
+                if location in locations:
+                    # For each line, if it is a GET on a known page, count it
+                    if (not user_agent in bot_user_agents and
+                            user_agent in client_user_agents or
+                            not any(map(re.search, crawler_patterns,
+                                        repeat(user_agent)))):
+                        visit_dict[location].add(client_ip)
+                        client_user_agents.add(user_agent)
+                    else:
+                        bot_visit_dict[location].add(client_ip)
+                        bot_user_agents.add(user_agent)
                 else:
-                    bot_visit_dict[location].add(client_ip)
-            else:
-                # Also count lines that are NOT GET on a known page in a different dict.
-                match_obj = re.match(other_pages_regex, line)
-                if match_obj:
-                    client_ip = match_obj.group("ip_address")
-                    location = match_obj.group("location")
-                    # Those other hits are either ressource loaded, in this case we group the hits
+                    # Also count lines that are NOT "GET on a known page" in a different dict.
+                    # Those other hits can be static site ressources loaded,
+                    # in which case we group the hits
                     if location.startswith("/isso/"):
                         other_visit_dict["/isso/*"].add(client_ip)
                     elif location.startswith("/assets/css/"):
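The rewritten log_line_template follows the default NGINX log format quoted in the comment above and captures the client IP, $time_local, the requested location and the user agent in a single pass. A small standalone sketch, with a made-up sample line, of what the compiled log_regex extracts:

    import re

    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                         '"(?P<user_agent>.+)"$')
    log_regex = re.compile(log_line_template.format(locations='.+?'))

    # Sample NGINX access log line (all values invented for illustration)
    sample = ('203.0.113.7 - - [23/Sep/2021:17:42:01 +0200] '
              '"GET /posts/example/ HTTP/1.1" 200 5123 "-" '
              '"Mozilla/5.0 (X11; Linux x86_64) Firefox/92.0"')
    match = log_regex.match(sample)
    if match:
        print(match.group("ip_address"))  # 203.0.113.7
        print(match.group("time_local"))  # 23/Sep/2021:17:42:01 +0200
        print(match.group("location"))    # /posts/example/
        print(match.group("user_agent"))  # Mozilla/5.0 (X11; Linux x86_64) Firefox/92.0

The two new sets also act as a cache, per the "check already existing UA before checking regex" commit: once a user agent has been classified as a regular client, later lines carrying the same user agent skip the crawler-pattern regex scan entirely.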
@@ -200,9 +226,18 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     elif location.startswith("/images/"):
                         other_visit_dict["/images/*"].add(client_ip)
                     else:
-                        # for everything else, we store the exact path
+                        # for everything else, we store the exact path, but not the query string
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-    return visit_dict, bot_visit_dict, other_visit_dict
+    today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
+                                                                          minute=0,
+                                                                          second=0,
+                                                                          microsecond=0)
+    additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
+                        "bot_user_agents_nb": len(bot_user_agents),
+                        "client_user_agents_nb": len(client_user_agents)}
+    return visit_dict, bot_visit_dict, other_visit_dict, additional_infos


 def main() -> None:
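The returned last_log_timestamp is the timestamp of midnight on the day of the last parsed log line, in the log's own timezone. A brief sketch of the derivation with a made-up $time_local value:

    from datetime import datetime

    time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
    last_log_date = "23/Sep/2021:17:42:01 +0200"  # sample value, for illustration only
    today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0, minute=0,
                                                                          second=0, microsecond=0)
    print(int(today_date.timestamp()))  # 1632348000, i.e. 2021-09-23 00:00:00 +02:00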
@@ -234,14 +269,16 @@ def main() -> None:
     # Get parser, get locations and parse the log file
     crawler_patterns = get_crawler_patterns(args.exclude_crawler)
     locations = get_locations(args.sitemap)
-    visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
+    visit_dict, bot_visit_dict, other_visit_dict, additional_infos = parse_logfile(args.logfile,
                                                                  locations,
                                                                  crawler_patterns)
     # Generate the report
     print_visit_dict("Standard visits", visit_dict)
+    print(f"There were {additional_infos['client_user_agents_nb']} unique client user agent(s)")
     if args.exclude_crawler:
         print_visit_dict("Bot visits", bot_visit_dict)
+        print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
     print_visit_dict("Other visits", other_visit_dict)

     if telegraf_url:
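For reference, additional_infos is the plain dict built at the end of parse_logfile, so the new report lines only index into it; a sketch with invented counts:

    additional_infos = {"last_log_timestamp": 1632348000,  # sample values only
                        "bot_user_agents_nb": 7,
                        "client_user_agents_nb": 42}
    print(f"There were {additional_infos['client_user_agents_nb']} unique client user agent(s)")
    # -> There were 42 unique client user agent(s)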
@@ -249,7 +286,9 @@ def main() -> None:
                                     username=username,
                                     password=_password,
                                     source=socket.gethostname())
-        exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)
+        exporter.export_result_to_telegraf(visit_dict,
+                                           bot_visit_dict,
+                                           additional_infos["last_log_timestamp"])


 if __name__ == "__main__":
     main()