reorganize the searching function
parent 6ce2cea3a1
commit 508b838bbc
@@ -176,9 +176,9 @@ def parse_logfile(logfile_path: str, locations: List[str],
     log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                          '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                          '"(?P<user_agent>.+)"$')
-    known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
-                                                                                  locations))))
-    other_pages_regex = re.compile(log_line_template.format(locations='.+?'))
+    #known_page_regex = re.compile(log_line_template.format(locations='|'.join(map(re.escape,
+    #                                                                             locations))))
+    log_regex = re.compile(log_line_template.format(locations='.+?'))
 
     # Output data structure initialization
     visit_dict: VisitDict = dict(map(lambda x: (x, set()), locations))
@@ -195,23 +195,21 @@ def parse_logfile(logfile_path: str, locations: List[str],
     logfile_path="/dev/null"
     with open(logfile_path, 'r', encoding='utf-8') as logfile:
         for line in logfile:
-            match_obj = re.match(known_page_regex, line)
+            match_obj = re.match(log_regex, line)
             if match_obj:
-                # For each line, check if it is a GET on a lnown page, and count those
                 client_ip = match_obj.group("ip_address")
                 location = match_obj.group("location")
-                if not any(map(re.search, crawler_patterns, repeat(line))):
-                    visit_dict[location].add(client_ip)
-                else:
-                    bot_visit_dict[location].add(client_ip)
-                    bot_user_agents.add(match_obj.group("user_agent"))
                 last_log_date = match_obj.group("time_local")
-            else:
-                # Also count lines that are NOT "GET on a known page" in a different dict.
-                match_obj = re.match(other_pages_regex, line)
-                if match_obj:
-                    client_ip = match_obj.group("ip_address")
-                    location = match_obj.group("location")
+                if location in locations:
+                    # For each line, if it is a GET on a known page, count it
+                    if not any(map(re.search, crawler_patterns,
+                                   repeat(match_obj.group("user_agent")))):
+                        visit_dict[location].add(client_ip)
+                    else:
+                        bot_visit_dict[location].add(client_ip)
+                        bot_user_agents.add(match_obj.group("user_agent"))
+                else:
+                    # Also count lines that are NOT "GET on a known page" in a different dict.
                     # Those other hits can be static site ressources loaded,
                     # in which case we group the hits
                     if location.startswith("/isso/"):
@@ -225,7 +223,6 @@ def parse_logfile(logfile_path: str, locations: List[str],
                     else:
                         # for everything else, we store the exact path, but not the query string
                         other_visit_dict[location.split('?')[0]].add(client_ip)
-                    last_log_date = match_obj.group("time_local")
     today_date = datetime.strptime(last_log_date, time_local_fmt).replace(hour=0,
                                                                           minute=0,
                                                                           second=0,
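For context, here is a minimal standalone sketch of the reorganized flow, assuming hypothetical values for locations, crawler_patterns, and the sample log line (the real script defines these elsewhere, and other_visit_dict is assumed here to be a defaultdict(set)): a single permissive regex matches every line once, the `location in locations` test replaces the old second pass with known_page_regex, and the crawler check now searches only the user agent instead of the whole line.

import re
from collections import defaultdict
from itertools import repeat

# Hypothetical values for illustration; the real script builds these elsewhere.
locations = ['/index.html', '/about.html']
crawler_patterns = [r'[Bb]ot', r'[Ss]pider']

log_line_template = (r'^(?P<ip_address>[0-9a-f.:]+) \- .+? \[(?P<time_local>.*)\] '
                     '"GET (?P<location>{locations}) .*?" [0-9]{{3}} [0-9]+ ".+?" '
                     '"(?P<user_agent>.+)"$')
# Single permissive regex: the known-page test moves into Python below.
log_regex = re.compile(log_line_template.format(locations='.+?'))

visit_dict = {loc: set() for loc in locations}
bot_visit_dict = {loc: set() for loc in locations}
other_visit_dict = defaultdict(set)  # assumed shape: path -> set of client IPs
bot_user_agents = set()

sample = ('198.51.100.7 - - [12/Mar/2023:10:11:12 +0000] '
          '"GET /index.html HTTP/1.1" 200 512 "-" "ExampleBot/1.0"')

match_obj = re.match(log_regex, sample)
if match_obj:
    client_ip = match_obj.group("ip_address")
    location = match_obj.group("location")
    last_log_date = match_obj.group("time_local")
    if location in locations:
        # Known page: the crawler check now scans only the user agent.
        if not any(map(re.search, crawler_patterns,
                       repeat(match_obj.group("user_agent")))):
            visit_dict[location].add(client_ip)
        else:
            bot_visit_dict[location].add(client_ip)
            bot_user_agents.add(match_obj.group("user_agent"))
    else:
        # Anything else: store the path without the query string.
        other_visit_dict[location.split('?')[0]].add(client_ip)

print(bot_visit_dict)  # {'/index.html': {'198.51.100.7'}, '/about.html': set()}

One consequence of the new shape: since locations arrives as a List[str], the `location in locations` test is a linear scan per line; converting it to a set up front would keep the lookup constant-time.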