#!/usr/bin/env python3
"""
Extract responses from a HAR file, which can be exported by browser dev tools.
Files are written for URLs that match a certain regular expression.
"""
# XXX: Warning - Quick best effort script, i.e. no proper error handling.
import re
import sys
import json
import base64
import argparse
from typing import Dict, Iterator, Tuple, Optional
def parse_har_log(filename: str) -> Iterator[Tuple[Dict, Dict]]:
    """
    Yield a (request, response) pair for every usable entry of a HAR file.

    The file is parsed as JSON and must have the shape
    ``{"log": {"entries": [...]}}``. Entries that are not objects, or whose
    "request"/"response" members are missing or not objects, are skipped so
    that a single malformed entry does not abort the whole extraction.

    This is a generator: the file is only opened and validated once
    iteration starts.

    :param filename: path of the HAR file to read
    :raises OSError: if the file cannot be opened or read
    :raises ValueError: if the content is not valid JSON / not decodable, or
        the top-level structure has no list of log entries
    """
    # HAR is defined as UTF-8 encoded JSON. Decoding explicitly avoids
    # depending on the platform default encoding, and "utf-8-sig" also
    # accepts a leading byte order mark, which json.load() would reject.
    with open(filename, "r", encoding="utf-8-sig") as fp:
        har = json.load(fp)

    log = har.get("log") if isinstance(har, dict) else None
    entries = log.get("entries") if isinstance(log, dict) else None
    if not isinstance(entries, list):
        raise ValueError(f"No/invalid log entries in {filename}")

    for entry in entries:
        if not isinstance(entry, dict):
            continue
        request = entry.get("request")
        response = entry.get("response")
        if isinstance(request, dict) and isinstance(response, dict):
            yield request, response
def match_request(request: Dict, url_rx: re.Pattern) -> bool:
    """
    Tell whether the request carries a string "url" that matches url_rx.

    The pattern is applied with ``match``, i.e. anchored at the start of
    the url. A missing or non-string "url" never matches.
    """
    candidate = request.get("url")
    if not isinstance(candidate, str):
        return False
    return url_rx.match(candidate) is not None
def match_response(response: Dict) -> bool:
    """
    Tell whether the response has an integer "status" equal to 200.

    A missing or non-integer "status" is treated as no match.
    """
    status = response.get("status")
    if not isinstance(status, int):
        return False
    return status == 200
def dump_response(response: Dict, filename: str) -> bool:
    """
    Write the body of a HAR response object to a file.

    The body is taken from ``response["content"]["text"]``. Without an
    "encoding" member the text is stored UTF-8 encoded; with
    ``"encoding": "base64"`` the text is base64-decoded first. Any other
    encoding is not supported.

    :param response: HAR response object
    :param filename: path of the file to (over)write
    :return: True if the file was written, False if the response has no
        usable content, the content cannot be decoded/encoded, or the file
        cannot be written
    """
    content = response.get("content")
    if not isinstance(content, dict):
        return False
    text = content.get("text")
    if not isinstance(text, str):
        return False

    try:
        if "encoding" not in content:
            data: bytes = text.encode("utf-8")
        elif content["encoding"] == "base64":
            data = base64.b64decode(text)
        else:
            return False
    except ValueError:
        # binascii.Error (malformed base64) and UnicodeEncodeError (e.g. lone
        # surrogates coming from JSON escapes) are both ValueError subclasses.
        # Report them as a failed dump, like every other failure here.
        return False

    try:
        with open(filename, "wb") as fp:
            fp.write(data)
    except OSError:
        return False
    return True
def main() -> int:
    """
    Command line entry point.

    Reads the HAR file given on the command line and prints one line per
    entry: "DUMP" if the response body was written to a file, "FAIL" if
    writing was attempted but did not succeed, and "SKIP" otherwise.
    Responses are only dumped when --url is given, the request url matches
    it and the response status is 200.

    :return: 0 after processing all entries, 1 if the HAR file could not
        be read or parsed (an invalid --url pattern exits via argparse)
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--url", type=str, default=None,
                        help="regular expression for the request url, no response content dump otherwise")
    parser.add_argument("file", metavar="archive.har",
                        help="http archive file to process")
    args = parser.parse_args()

    filename: str = args.file
    url_rx: Optional[re.Pattern] = None
    if args.url is not None:
        try:
            url_rx = re.compile(args.url)
        except re.error as exc:
            # Report a bad pattern as a usage error instead of a traceback.
            parser.error(f"invalid --url regular expression: {exc}")

    # Materialize the entries up front so that only errors from reading the
    # archive are handled here, not errors raised while processing entries.
    try:
        entries = list(parse_har_log(filename))
    except (OSError, ValueError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 1

    for request, response in entries:
        # The url may be absent or not a string; match_request() rejects
        # such requests, so they simply end up being skipped.
        url = request.get("url")
        if url_rx is not None and match_request(request, url_rx) and match_response(response):
            # Derive a flat file name from the url. The mapping is lossy, so
            # distinct urls can map to the same name; later dumps overwrite
            # earlier ones.
            fn: str = re.sub(r"[^a-zA-Z0-9_-]", "_", url)
            if dump_response(response, fn):
                print(f"DUMP: {url}")
            else:
                print(f"FAIL: {url}")
        else:
            print(f"SKIP: {url}")

    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())