From 6644749faba6b190becbdc5b0ab4f410ed7bf550 Mon Sep 17 00:00:00 2001
From: Hugo
Date: Sat, 18 Sep 2021 13:40:00 +0200
Subject: [PATCH] huge improvements

---
 .gitignore        |   5 +-
 get_page_stats.py | 235 ++++++++++++++++++++++++++++++++++++----------
 requirements.txt  |   1 +
 3 files changed, 188 insertions(+), 53 deletions(-)
 mode change 100644 => 100755 get_page_stats.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index f8b73e7..a5ac2b0
--- a/.gitignore
+++ b/.gitignore
@@ -109,7 +109,7 @@ celerybeat.pid
 .env
 .venv
 env/
-venv/
+*venv/
 ENV/
 env.bak/
 venv.bak/
@@ -138,3 +138,6 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+# config
+*.conf
+crawler-user-agents.json
diff --git a/get_page_stats.py b/get_page_stats.py
old mode 100644
new mode 100755
index ae2ecf9..0054a5c
--- a/get_page_stats.py
+++ b/get_page_stats.py
@@ -1,67 +1,175 @@
-#!/bin/env python3
+#!/usr/bin/env python3
 """
 Script to parse a sitemap.xml file, then look through a NGINX log file for the number of hits
 for each of the URLs defined in the sitemap, by unique IP.
 """
+import os
 import re
+import sys
 import json
+import socket
+import getpass
 import argparse
+import subprocess
+import configparser
+from datetime import datetime, time
 from collections import defaultdict
 from itertools import repeat
-from subprocess import run
 from urllib.parse import urlparse
+from typing import Dict, List, Tuple, Set, Pattern
 import xml.etree.ElementTree as ET
+import requests
 
 
-def parse_args():
-    parser = argparse.ArgumentParser(description='Collect number of daily loading of each page in the nginx log file.')
+VisitDict = Dict[str, Set[str]]
+
+
+def parse_args() -> argparse.Namespace:
+    """ Parse the arguments of the script
+    """
+    parser = argparse.ArgumentParser(description='Collect number of daily loading of each page '
+                                                 'in the nginx log file.')
     parser.add_argument("-s", "--sitemap", default="/var/www/html/sitemap.xml",
                         help="Path to the sitemap xml file for the website.")
     parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                         help="Path to the log file to analyze")
     parser.add_argument("-e", "--exclude-crawler", action="store_true",
-                        help="If set, uses a crawler-user-agent.json file to exclude requests made by bots.")
+                        help="If set, uses a crawler-user-agents.json file to exclude requests "
+                             "made by bots.")
+    parser.add_argument("-t", "--telegraf-url",
+                        help="URL for a telegraf http listener v2")
+    parser.add_argument("-u", "--user",
+                        help="Username for the telegraf export")
+    parser.add_argument("-c", "--config-file",
+                        help="Configuration file for the URL, the username and password of "
+                             "the exporter")
     return parser.parse_args()
 
 
-def main():
-    """ Parses the arguments, the crawler file and the sitemap,
-    Then reads the log file line by line, regexes through it to isolate locations and client IP
-    It records the number of unique IP accessing each known pages (from the sitemap), and
-    the number of unique IP accessing each unknown locations.
-    (either ressources being loaded or bot looking for vulnerable website).
+def print_visit_dict(title: str, visit_dict: VisitDict) -> None:
+    """ Pretty-print a visit dictionary
+    Keys are locations, values are sets of IPs.
     """
-    args = parse_args()
+    total_visits = 0
+    print(f'======== {title} ========')
+    for loc, ips in visit_dict.items():
+        print(f"{loc}: {len(ips)}")
+        total_visits += len(ips)
+    print(f'Total visits for {title}: {total_visits}')
 
-    if args.exclude_crawler:
+
+class TelegrafExporter:
+    """ A class to export visit counts to a telegraf instance using the http_listener_v2
+    input plugin
+    """
+    def __init__(self, telegraf_url: str, username: str, password: str, source: str):
+        self.telegraf_url = telegraf_url
+        self.username = username
+        self._password = password
+        self.source = source
+
+    def telegraf_post(self, timestamp: int, title: str, location: str,
+                      count: int) -> requests.Response:
+        """ Post a value to telegraf
+        """
+        payload = {"name": title,
+                   "timestamp": timestamp,
+                   "source": self.source,
+                   "location": location,
+                   "hits": count}
+        return requests.post(self.telegraf_url,
+                             json=payload,
+                             auth=(self.username, self._password))
+
+    def export_result_to_telegraf(self, page_hits: VisitDict, bot_hits: VisitDict) -> None:
+        """ Export the page_hits and bot_hits dictionaries to telegraf
+        """
+        # export standard hits
+        timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())
+        name = "blog_client_hit"
+        for location, ips in page_hits.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              name,
+                                              location,
+                                              len(ips))
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
+        # export bot hits
+        name = "blog_bot_hit"
+        for location, ips in bot_hits.items():
+            try:
+                response = self.telegraf_post(timestamp,
+                                              name,
+                                              location,
+                                              len(ips))
+                response.raise_for_status()
+            except requests.exceptions.RequestException as excpt:
+                print(excpt)
+                sys.exit(1)
+
+
+def get_crawler_patterns(exclude_crawler: bool) -> List[Pattern]:
+    """ Parse the crawler-user-agents file and return a list
+    of compiled crawler regex patterns
+    """
+    if exclude_crawler:
+        base_path = os.path.dirname(os.path.abspath(__file__))
+        crawler_path = os.path.join(base_path, "crawler-user-agents.json")
+        if not os.path.exists(crawler_path) or os.path.getsize(crawler_path) == 0:
+            # retrieve the crawler file from github
+            cmd = ["wget", "-O", crawler_path,
+                   "https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/"
+                   "crawler-user-agents.json"]
+            subprocess.run(cmd, check=False)
         try:
-            with open("./crawler-user-agents.json", 'r') as crawler_file:
+            with open(crawler_path, 'r', encoding='utf-8') as crawler_file:
                 crawlers = json.load(crawler_file)
         except (FileNotFoundError, json.JSONDecodeError):
-            print("Could not open the crawler user agent file")
+            print("Could not open and use the crawler user agent file")
             crawlers = []
     else:
         crawlers = []
 
     # Crawlers patterns are built once and for all for speed
     crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]
+    return crawler_patterns
+
+
+def get_locations(sitemap_path: str) -> List[str]:
+    """ Parse a sitemap file and return the list of all its locations
+    """
     locations = []
-    tree = ET.parse(args.sitemap)
+    tree = ET.parse(sitemap_path)
     root = tree.getroot()
     # Get the default XML namespace, needed for tag lookup later
-    ns = re.match(r'{.*}', root.tag).group(0)
+    match_nsp = re.match(r'{.*}', root.tag)
+    nsp = match_nsp.group(0) if match_nsp else ""
     for url in root:
-        locations.append(urlparse(url.find(f"{ns}loc").text).path)
+        loc_elmt = url.find(f"{nsp}loc")
+        if loc_elmt is not None:
+            locations.append(str(urlparse(loc_elmt.text).path))
+    return locations
+
+
+def parse_logfile(logfile_path: str, locations: List[str],
+                  crawler_patterns: List[Pattern]) -> Tuple[VisitDict, VisitDict, VisitDict]:
+    """ Parse a logfile and return 3 dicts:
+    page_hits, bot_hits and other_hits
+    """
+    # Regexes for all the pattern matching
     log_line_template = r'^(?P<ip_address>[0-9a-f.:]+) .*"GET (?P<location>{locations}) .*'
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape, locations))))
+    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
                                                                                   locations))))
     other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
-    visit_dict = dict(map(lambda x: (x, set()), locations))
-    bot_visit_dict = dict(map(lambda x: (x, set()), locations))
-    other_visits = defaultdict(set)
-    with open(args.logfile, 'r') as logfile:
+
+    # Output data structure initialization
+    visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
+    bot_visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
+    other_visit_dict: VisitDict = defaultdict(set)
+
+    with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
             match_obj = re.match(known_page_regex, line)
             if match_obj:
+                # For each line, check if it is a GET on a known page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
                 if not any(map(re.search, crawler_patterns, repeat(line))):
@@ -69,48 +177,71 @@
                 else:
                     bot_visit_dict[location].add(client_ip)
             else:
+                # Also count lines that are NOT a GET on a known page, in a different dict.
                 match_obj = re.match(other_pages_regex, line)
                 if match_obj:
                     client_ip = match_obj.group("ip_address")
                     location = match_obj.group("location")
+                    # Those other hits are mostly resources being loaded; group them by prefix
                    if location.startswith("/isso/"):
-                        other_visits["/isso/*"].add(client_ip)
+                        other_visit_dict["/isso/*"].add(client_ip)
                     elif location.startswith("/assets/css/"):
-                        other_visits["/assets/css/*"].add(client_ip)
+                        other_visit_dict["/assets/css/*"].add(client_ip)
                     elif location.startswith("/assets/js/"):
-                        other_visits["/assets/js/*"].add(client_ip)
+                        other_visit_dict["/assets/js/*"].add(client_ip)
                     elif location.startswith("/images/"):
-                        other_visits["/images/*"].add(client_ip)
+                        other_visit_dict["/images/*"].add(client_ip)
                     else:
-                        other_visits[location.split('?')[0]].add(client_ip)
+                        # for everything else, we store the exact path
+                        other_visit_dict[location.split('?')[0]].add(client_ip)
+    return visit_dict, bot_visit_dict, other_visit_dict
 
-    total_visits=0
-    print("Standard visits:")
-    for loc, ips in visit_dict.items():
-        print(f"{loc}: {len(ips)}")
-        total_visits += len(ips)
-    print(f'Total visits: {total_visits}')
 
+def main() -> None:
+    """ Parse the arguments, the crawler file and the sitemap,
+    then read the log file line by line, using regexes to isolate locations and client IPs.
+    Record the number of unique IPs accessing each known page (from the sitemap), and
+    the number of unique IPs accessing each unknown location
+    (either resources being loaded or bots looking for a vulnerable website).
+    """
+    args = parse_args()
+    telegraf_url = ""
+
+    # Read config file
+    if args.config_file:
+        config = configparser.ConfigParser()
+        config.read(args.config_file)
+        try:
+            username = config["telegraf"]["username"]
+            telegraf_url = config["telegraf"]["url"]
+            _password = config["telegraf"]["password"]
+        except KeyError as excpt:
+            print(f"Error: missing key in configuration file '{args.config_file}': {excpt.args[0]}")
+            sys.exit(1)
+    elif args.telegraf_url:
+        telegraf_url = args.telegraf_url
+        username = args.user if args.user else input("Telegraf username: ")
+        _password = getpass.getpass("Telegraf password: ")
+
+    # Get the crawler patterns and the locations, then parse the log file
+    crawler_patterns = get_crawler_patterns(args.exclude_crawler)
+    locations = get_locations(args.sitemap)
+    visit_dict, bot_visit_dict, other_visit_dict = parse_logfile(args.logfile,
                                                                  locations,
                                                                  crawler_patterns)
+
+    # Generate the report
+    print_visit_dict("Standard visits", visit_dict)
     if args.exclude_crawler:
-        print("Bot visits:")
-        for loc, ips in bot_visit_dict.items():
-            print(f"{loc}: {len(ips)}")
-    nb_other_visits = 0
-    print("Other visits:")
-    for loc, ips in other_visits.items():
-        print(f"{loc}: {len(ips)}")
-        nb_other_visits += len(ips)
-    print(f'Total visits: {total_visits}')
-    print(f'Other visits: {nb_other_visits}')
+        print_visit_dict("Bot visits", bot_visit_dict)
+    print_visit_dict("Other visits", other_visit_dict)
 
-    #for path in locations:
-    #    # Pre-process log file using grep, to keep only interesting lines
-    #    cmd = ["grep", "-e", f'GET {path} ', args.logfile]
-    #    process = run(cmd, capture_output=True, text=True)
-    #    # Silmutaneously keep only unique source IP and exclude crawlers if resquested
-    #    lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}
-    #
-    #    print(f"{path}: {len(lines)}")
+    if telegraf_url:
+        exporter = TelegrafExporter(telegraf_url=telegraf_url,
+                                    username=username,
+                                    password=_password,
+                                    source=socket.gethostname())
+        exporter.export_result_to_telegraf(visit_dict, bot_visit_dict)
 
 
 if __name__ == "__main__":
     main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f229360
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests
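
Reviewer note: the new --config-file option expects an INI-style file with a [telegraf] section holding url, username and password keys (see the configparser lookups in main()). Below is a minimal sketch that generates a compatible file with configparser, so the key names stay in sync with what the script reads; the file name and every value are placeholders of my own, not something defined by this patch.

    #!/usr/bin/env python3
    """Write a sample config file for get_page_stats.py --config-file (sketch; placeholder values)."""
    import configparser

    config = configparser.ConfigParser()
    config["telegraf"] = {
        # Key names mirror main(): config["telegraf"]["url" | "username" | "password"]
        "url": "http://localhost:8186/telegraf",  # hypothetical telegraf http_listener_v2 endpoint
        "username": "blog-stats",                 # hypothetical basic-auth user
        "password": "change-me",                  # hypothetical basic-auth password
    }

    with open("get_page_stats.conf", "w", encoding="utf-8") as conf_file:
        config.write(conf_file)

The resulting file could then be used as, for example, ./get_page_stats.py -e -c get_page_stats.conf; the *.conf pattern added to .gitignore keeps such a file out of the repository.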
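
For whoever configures the receiving telegraf instance: each call to TelegrafExporter.telegraf_post() sends a single JSON object with the keys name, timestamp, source, location and hits, authenticated with HTTP basic auth. The sketch below builds that payload the same way the patch does (the page path and hit count are illustrative; how the JSON parser on the telegraf side maps these keys to tags, fields and the metric time is a deployment choice outside this patch).

    """Preview the JSON body posted by TelegrafExporter.telegraf_post() (illustrative values)."""
    import json
    import socket
    from datetime import datetime, time

    # Midnight of the current day, as computed in export_result_to_telegraf()
    timestamp = int(datetime.combine(datetime.now().date(), time()).timestamp())

    payload = {
        "name": "blog_client_hit",       # or "blog_bot_hit" for the bot series
        "timestamp": timestamp,          # unix epoch, in seconds
        "source": socket.gethostname(),  # same source tag the script uses
        "location": "/index.html",       # illustrative page path
        "hits": 42,                      # illustrative number of unique client IPs
    }
    print(json.dumps(payload, indent=2))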