#!/usr/bin/env python3

"""
Extract responses from a HAR (HTTP Archive) file, as exported by browser dev tools.
Response bodies are written to files for request URLs that match a given regular expression.
"""
# XXX: Warning - quick best-effort script, i.e. no proper error handling.
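# Example invocation (script and file names are illustrative):
#   python3 extract_har.py --url 'https://api\.example\.com/' session.har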


import re
import sys
import json
import base64
import argparse

from typing import Dict, Iterator, Tuple, Optional


def parse_har_log(filename: str) -> Iterator[Tuple[Dict, Dict]]:
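    # A HAR file is JSON shaped like {"log": {"entries": [{"request": ..., "response": ...}, ...]}}.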
    with open(filename, "r") as fp:
        har = json.load(fp)
    if (not isinstance(har, dict)
            or not isinstance(har.get("log"), dict)
            or not isinstance(har["log"].get("entries"), list)):
        raise ValueError(f"No/invalid log entries in {filename}")
    yield from ((entry["request"], entry["response"]) for entry in har["log"]["entries"])


def match_request(request: Dict, url_rx: re.Pattern) -> bool:
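    # re.Pattern.match() anchors at the start of the string: the pattern must
    # match a prefix of the URL, not necessarily the whole URL.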
    return "url" in request and isinstance(request["url"], str) and url_rx.match(request["url"]) is not None


def match_response(response: Dict) -> bool:
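    # Only successful (HTTP 200) responses are considered for dumping.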
    return "status" in response and isinstance(response["status"], int) and response["status"] == 200


def dump_response(response: Dict, filename: str) -> bool:
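    # HAR stores the response body under content["text"]; binary bodies are
    # base64-encoded and flagged via content["encoding"] == "base64".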
    if "content" in response and isinstance(response["content"], dict):
        content: Dict = response["content"]
        if "text" in content and isinstance(content["text"], str):
            if "encoding" not in content:
                # Plain text body; store as UTF-8 bytes.
                data: bytes = content["text"].encode("utf-8")
            elif content["encoding"] == "base64":
                # Binary body, base64-encoded in the HAR file.
                data = base64.b64decode(content["text"])
            else:
                # Unknown encoding; skip.
                return False
            try:
                with open(filename, "wb") as fp:
                    fp.write(data)
                    return True
            except OSError:
                return False
    return False


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--url", type=str, default=None,
                        help="regular expression matched against request URLs; "
                             "if omitted, no response content is dumped")
    parser.add_argument("file", metavar="archive.har",
                        help="HTTP archive (HAR) file to process")

    args = parser.parse_args()
    filename: str = args.file
    url_rx: Optional[re.Pattern] = re.compile(args.url) if args.url is not None else None

    for request, response in parse_har_log(filename):
        url: str = request.get("url", "")
        if url_rx is not None and match_request(request, url_rx) and match_response(response):
            # Derive a filesystem-safe filename from the URL (distinct URLs may collide).
            fn: str = re.sub("[^a-zA-Z0-9_-]", "_", url)
            if dump_response(response, fn):
                print(f"DUMP: {url}")
            else:
                print(f"FAIL: {url}")
        else:
            print(f"SKIP: {url}")

    return 0


if __name__ == "__main__":
    sys.exit(main())