Compare commits
No commits in common. "94519bfd5e0bd3942aa3de7d103e2fe347cd0c2c" and "fe019750a52ae01656847ddbeba17a581daa030b" have entirely different histories.
94519bfd5e...fe019750a5
@@ -12,7 +12,7 @@ import getpass
 import argparse
 import subprocess
 import configparser
-from datetime import datetime
+from datetime import datetime, time
 from collections import defaultdict
 from itertools import repeat
 from urllib.parse import urlparse
@@ -69,12 +69,6 @@ class TelegrafExporter():
     def telegraf_post(self, timestamp:int, create_time: int, title:str,
                       location:str, count:int)-> requests.Response:
         """ Post a value to telegraf
-        :param timestamp: timestamp used by influxdb as time field.
-        :param create_time: second of the day at which the data point is exported
-            (to de-duplicate entries generated on the same day).
-        :param title: name of the destination table in influxdb
-        :param location: path for which we register the hit count, used as a tag in influxdb.
-        :param count: hit count for the aforementioned path
         """
         payload = {"name": title,
                    "timestamp": timestamp,
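Note: telegraf_post (unchanged here apart from its docstring) wraps an HTTP POST to a Telegraf listener. A rough sketch of the kind of request it issues, based only on the payload keys visible in this hunk ("name", "timestamp") and the removed docstring parameters; the endpoint URL, credentials, and the remaining field names are assumptions, not taken from the diff:

    import requests

    payload = {"name": "blog_client_hit",    # destination measurement in influxdb
               "timestamp": 1678579200,      # hypothetical midnight timestamp (time field)
               "create_time": 36932,         # second of the day, de-duplicates same-day points
               "location": "/posts/hello/",  # hypothetical page path, used as a tag
               "count": 3}                   # hit count for that path
    response = requests.post("https://telegraf.example.org/post",  # hypothetical endpoint
                             json=payload,
                             auth=("exporter", "s3cret"))          # hypothetical credentials
    response.raise_for_status()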
@@ -87,14 +81,13 @@ class TelegrafExporter():
                              auth=(self.username, self._password))

-
-    def export_result_to_telegraf(self, page_hits: VisitDict,
-                                  bot_hits: VisitDict, timestamp: int) -> None:
+    def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
         """ Export the bot_hits and page_hits dictionnaries to telegraf
         """
         # export standard hits
+        timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
         now = datetime.now().time()
         create_time = now.second + 60*now.minute + 3600*now.hour

         name="blog_client_hit"
         for location, ips in page_hits.items():
             try:
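Note: the right-hand side of the hunk above computes the export timestamp inside export_result_to_telegraf instead of receiving it as a parameter; this is also what the new `time` import in the first hunk is for. A minimal sketch of what the added line evaluates to:

    from datetime import datetime, time

    # time() with no arguments is midnight (00:00:00), so combining it with
    # today's date pins the timestamp to the start of the current local day.
    midnight = datetime.combine(datetime.now().date(), time())
    timestamp = int(midnight.timestamp())  # same value for every run on a given day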
@@ -163,60 +156,41 @@ def get_locations(sitemap_path:str) -> List[str]:


 def parse_logfile(logfile_path: str, locations: List[str],
-                  crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict,
-                                                        VisitDict, Dict[str, int]]:
-    """ Parse a logfile, and return 4 dicts:
-        page_hits, bot_hits, other_hits and additional_infos
+                  crawler_patterns: List[str]) -> Tuple[VisitDict, VisitDict, VisitDict]:
+    """ Parse a logfile, and return 3 dicts:
+        page_hits, bot_hits and other_hits
     """
-    time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"
     # Regexes for all the pattern matching
-    # Default format for NGINX log is:
-    # pylint: disable=line-too-long
-    # $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
-    log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
-                         '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
-                         '"(?P<user_agent>.+)"$')
-    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-    #                                                                              locations))))
-    log_regex = re.compile(log_line_template.format(locations='.+?'))
+    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
+    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+                                                                                  locations))))
+    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))

     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
     other_visit_dict: VisitDict = defaultdict(set)
-    bot_user_agents = set()
-    client_user_agents = set()
-    # The way to get the timezone data here is not great (not taking into account DST and such)
-    # but it is a fallback default date that should hardly ever be used.
-    last_log_date = datetime.now(datetime.now().astimezone().tzinfo).strftime(time_local_fmt)
-
-    # Do not parse a log file that has not been edited since more than 24 hours
     if abs(os.path.getmtime(logfile_path) - datetime.now().timestamp()) >= 24 * 3600:
         print("Log file is too old, there was no access today.")
         logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(log_regex, line)
-            if match_obj:
-                client_ip = match_obj.group("ip_address")
-                location = match_obj.group("location")
-                last_log_date = match_obj.group("time_local")
-                user_agent = match_obj.group("user_agent")
-                if location in locations:
-                    # For each line, if it is a GET on a known page, count it
-                    if (not user_agent in bot_user_agents and
-                            user_agent in client_user_agents or
-                            not any(map(re.search, crawler_patterns,
-                                        repeat(user_agent)))):
-                        visit_dict[location].add(client_ip)
-                        client_user_agents.add(user_agent)
-                    else:
-                        bot_visit_dict[location].add(client_ip)
-                        bot_user_agents.add(user_agent)
-                else:
-                    # Also count lines that are NOT "GET on a known page" in a different dict.
-                    # Those other hits can be static site ressources loaded,
-                    # in which case we group the hits
+            match_obj = re.match(known_page_regex, line)
+            if match_obj:
+                # For each line, check if it is a GET on a lnown page, and count those
+                client_ip = match_obj.group("ip_address")
+                location = match_obj.group("location")
+                if not any(map(re.search, crawler_patterns, repeat(line))):
+                    visit_dict[location].add(client_ip)
+                else:
+                    bot_visit_dict[location].add(client_ip)
+            else:
+                # Also count lines that are NOT GET on a known page in a different dict.
+                match_obj = re.match(other_pages_regex, line)
+                if match_obj:
+                    client_ip = match_obj.group("ip_address")
+                    location = match_obj.group("location")
+                    # Those other hits are either ressource loaded, in this case we group the hits
                     if location.startswith("/isso/"):
                         other_visit_dict["/isso/*"].add(client_ip)
                     elif location.startswith("/assets/css/"):
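Note: the rewritten parser compiles the same template twice: once with the sitemap locations baked in (known pages) and once with a catch-all (everything else). A quick sketch of how the two regexes split log lines; the location list and the sample NGINX lines below are made up for illustration:

    import re

    log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
    locations = ["/posts/hello-world/"]  # hypothetical sitemap entries
    known_page_regex = re.compile(log_line_template.format(
        locations='|'.join(map(re.escape, locations))))
    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))

    page = '203.0.113.7 - - [12/Mar/2023:10:15:32 +0000] "GET /posts/hello-world/ HTTP/1.1" 200 5123 "-" "Mozilla/5.0"'
    asset = '203.0.113.7 - - [12/Mar/2023:10:15:33 +0000] "GET /assets/css/main.css HTTP/1.1" 200 812 "-" "Mozilla/5.0"'

    assert known_page_regex.match(page).group("location") == "/posts/hello-world/"
    assert known_page_regex.match(asset) is None  # not a known page
    assert other_pages_regex.match(asset).group("location") == "/assets/css/main.css"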
@@ -226,18 +200,9 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     elif location.startswith("/images/"):
                         other_visit_dict["/images/*"].add(client_ip)
                     else:
-                        # for everything else, we store the exact path, but not the query string
+                        # for everything else, we store the exact path
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-    today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
-                                                                          minute=0,
-                                                                          second=0,
-                                                                          microsecond=0)
-    additional_infos = {"last_log_timestamp": int(today_date.timestamp()),
-                        "bot_user_agents_nb": len(bot_user_agents),
-                        "client_user_agents_nb": len(client_user_agents)}
-
-
-    return visit_dict, bot_visit_dict, other_visit_dict, additional_infos
+    return visit_dict, bot_visit_dict, other_visit_dict


 def main() -> None:
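Note: the removed left-hand code truncated the last parsed $time_local value to midnight and exported it as additional_infos["last_log_timestamp"]. A minimal sketch of that computation, with a hypothetical log date:

    from datetime import datetime

    time_local_fmt = "%d/%b/%Y:%H:%M:%S %z"       # NGINX $time_local format
    last_log_date = "12/Mar/2023:10:15:32 +0000"  # hypothetical value from a log line
    today_date = datetime.strptime(last_log_date, time_local_fmt).replace(
        hour=0, minute=0, second=0, microsecond=0)
    print(int(today_date.timestamp()))            # midnight of that entry's day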
@@ -269,16 +234,14 @@ def main() -> None:
     # Get parser, get locations and parse the log file
     crawler_patterns = get_crawler_patterns(args.exclude_crawler)
     locations = get_locations(args.sitemap)
-    visit_dict, bot_visit_dict, other_visit_dict, additional_infos = parse_logfile(args.logfile,
-                                                                                   locations,
-                                                                                   crawler_patterns)
+    visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
+                                                                 locations,
+                                                                 crawler_patterns)

     # Generate the report
     print_visit_dict("Standard visits", visit_dict)
-    print(f"There were {additional_infos['client_user_agents_nb']} unique client user agent(s)")
     if args.exclude_crawler:
         print_visit_dict("Bot visits", bot_visit_dict)
-        print(f"There were {additional_infos['bot_user_agents_nb']} unique bot user agent(s)")
     print_visit_dict("Other visits", other_visit_dict)

     if telegraf_url:
@@ -286,9 +249,7 @@ def main() -> None:
                                     username=username,
                                     password=_password,
                                     source=socket.gethostname())
-        exporter.export_result_to_telegraf(visit_dict,
-                                           bot_visit_dict,
-                                           additional_infos["last_log_timestamp"])
+        exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)

 if __name__ == "__main__":
     main()