First version of the script (#1)
Script creation

Co-authored-by: Hugo <saxodwarf@saxodwarf.fr>
Reviewed-on: #1
Co-authored-by: saxodwarf <saxodwarf@noreply.localhost>
Co-committed-by: saxodwarf <saxodwarf@noreply.localhost>
parent b2058509c8
commit c3eb31258e
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
""" Script to parse a sitemap.xml file,
then look through an NGINX log file for the number of hits for each of the URLs
defined in the sitemap, by unique IP.
"""
import re
import json
import argparse
from itertools import repeat
from subprocess import run
from urllib.parse import urlparse
import xml.etree.ElementTree as ET


def parse_args():
    parser = argparse.ArgumentParser(description='Collect the number of daily loads of each page from the nginx log file.')
    parser.add_argument("-s", "--sitemap", default="/var/www/my_webapp/www/sitemap.xml",
                        help="Path to the sitemap xml file for the website.")
    parser.add_argument("-l", "--logfile", default="/var/log/nginx/saxodwarf.fr-access.log.1",
                        help="Path to the log file to analyze.")
    parser.add_argument("-e", "--exclude-crawler", action="store_true",
                        help="If set, uses a crawler-user-agents.json file to exclude requests made by bots.")
    return parser.parse_args()


def main():
    """ Parses the arguments, the crawler file and the sitemap,
    then, for each location, uses grep to select the lines containing GET calls for
    the location, and prints the number of unique IPs accessing it.
    """
    args = parse_args()

    if args.exclude_crawler:
        try:
            with open("./crawler-user-agents.json", 'r') as crawler_file:
                crawlers = json.load(crawler_file)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Could not open the crawler user agent file")
            crawlers = []
    else:
        crawlers = []
    # Crawler patterns are compiled once up front, for speed
    crawler_patterns = [re.compile(entry["pattern"]) for entry in crawlers]

    locations = []
    tree = ET.parse(args.sitemap)
    root = tree.getroot()
    # Get the default XML namespace, needed for tag lookup later
    ns = re.match(r'{.*}', root.tag).group(0)
    for url in root:
        locations.append(urlparse(url.find(f"{ns}loc").text).path)

    for path in locations:
        # Pre-process the log file using grep, to keep only the interesting lines
        cmd = ["grep", "-e", f'GET {path} ', args.logfile]
        process = run(cmd, capture_output=True, text=True)
        # Simultaneously keep only unique source IPs and exclude crawlers if requested
        lines = {line.split(' ')[0] for line in process.stdout.splitlines() if not any(map(re.search, crawler_patterns, repeat(line)))}

        print(f"{path}: {len(lines)}")


if __name__ == "__main__":
    main()
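The diff does not show the file name the script is saved under; assuming, purely for illustration, that it is saved as sitemap_hits.py, a typical run against the defaults defined above would look like:

    python3 sitemap_hits.py --sitemap /var/www/my_webapp/www/sitemap.xml --logfile /var/log/nginx/saxodwarf.fr-access.log.1 --exclude-crawler

This prints one "path: count" line per URL listed in the sitemap, where count is the number of distinct client IPs that requested that path.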
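A note on the densest line in main(): the set comprehension pairs every compiled crawler pattern with the same log line via map(re.search, crawler_patterns, repeat(line)), drops the line if any pattern matches, and then keeps only the first whitespace-separated field (the client IP) in a set so duplicates collapse. Below is a minimal, self-contained sketch of that idiom; bot_patterns and sample_lines are made up for the example and are not part of the script.

    import re
    from itertools import repeat

    # Hypothetical crawler patterns; the script builds these from
    # crawler-user-agents.json, which stores one regex string per entry under "pattern".
    bot_patterns = [re.compile(p) for p in (r"Googlebot", r"bingbot")]

    # Hypothetical NGINX access-log lines; the client IP is the first field.
    sample_lines = [
        '203.0.113.7 - - [01/Jan/2024:00:00:01 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
        '203.0.113.7 - - [01/Jan/2024:00:00:09 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
        '198.51.100.4 - - [01/Jan/2024:00:01:00 +0000] "GET /about/ HTTP/1.1" 200 512 "-" "Googlebot/2.1"',
    ]

    # Same idiom as the script: exclude lines matched by any crawler pattern,
    # then deduplicate on the first field (the source IP).
    unique_ips = {line.split(' ')[0] for line in sample_lines
                  if not any(map(re.search, bot_patterns, repeat(line)))}
    print(len(unique_ips))  # prints 1: both human requests share one IP, the bot line is excluded

repeat(line) simply supplies the same line as the second argument for every pattern; an equivalent and arguably more readable spelling would be any(p.search(line) for p in bot_patterns).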