# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2021 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

import gzip
import json
import os
import sys
import tempfile
import unicodedata
from typing import Dict, Tuple

from hypothesis.configuration import mkdir_p, storage_directory
from hypothesis.errors import InvalidArgument

intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]


def charmap_file():
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, "charmap.json.gz"
    )


_charmap = None


def charmap():
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as i:
                tmp_charmap = dict(json.load(i))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown.  This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append([last_start, i - 1])
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append([last_start, sys.maxunicode])

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                mkdir_p(tmpdir)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and that both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap


_categories = None


def categories():
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        _categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        _categories.remove("Cc")  # Other, Control
        _categories.remove("Cs")  # Other, Surrogate
        _categories.append("Cc")
        _categories.append("Cs")
    return tuple(_categories)


def as_general_categories(cats, name="cats"):
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any elements that do not represent a
    major class or a class with subclass, a deprecation warning is raised.
    """
    if cats is None:
        return None
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)


def _union_intervals(x, y):
    """Merge two sequences of intervals into a single tuple of intervals.

    Any integer bounded by `x` or `y` is also bounded by the result.

    >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
    ((1, 17),)
    """
    if not x:
        return tuple((u, v) for u, v in y)
    if not y:
        return tuple((u, v) for u, v in x)
    intervals = sorted(x + y, reverse=True)
    result = [intervals.pop()]
    while intervals:
        # 1. intervals is in descending order
        # 2. pop() takes from the RHS.
        # 3. (a, b) was popped 1st, then (u, v) was popped 2nd
        # 4. Therefore: a <= u
        # 5. We assume that u <= v and a <= b
        # 6. So we need to handle 2 cases of overlap, and one disjoint case
        #    |   u--v     |   u----v   |       u--v  |
        #    |   a----b   |   a--b     |  a--b       |
        u, v = intervals.pop()
        a, b = result[-1]
        if u <= b + 1:
            # Overlap cases
            result[-1] = (a, max(v, b))
        else:
            # Disjoint case
            result.append((u, v))
    return tuple(result)


def _subtract_intervals(x, y):
    """Set difference for lists of intervals. That is, returns a list of
    intervals that bounds all values bounded by x that are not also bounded by
    y. x and y are expected to be in sorted order.

    For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would
    return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the
    interval.
    """
    if not y:
        return tuple(x)
    x = list(map(list, x))
    i = 0
    j = 0
    result = []
    while i < len(x) and j < len(y):
        # Iterate in parallel over x and y. j stays pointing at the smallest
        # interval in the left hand side that could still overlap with some
        # element of x at index >= i.
        # Similarly, i is not incremented until we know that it does not
        # overlap with any element of y at index >= j.

        xl, xr = x[i]
        assert xl <= xr
        yl, yr = y[j]
        assert yl <= yr

        if yr < xl:
            # The interval at y[j] is strictly to the left of the interval at
            # x[i], so will not overlap with it or any later interval of x.
            j += 1
        elif yl > xr:
            # The interval at y[j] is strictly to the right of the interval at
            # x[i], so all of x[i] goes into the result as no further intervals
            # in y will intersect it.
            result.append(x[i])
            i += 1
        elif yl <= xl:
            if yr >= xr:
                # x[i] is contained entirely in y[j], so we just skip over it
                # without adding it to the result.
                i += 1
            else:
                # The beginning of x[i] is contained in y[j], so we update the
                # left endpoint of x[i] to remove this, and increment j as we
                # now have moved past it. Note that this is not added to the
                # result as is, as more intervals from y may intersect it so it
                # may need updating further.
                x[i][0] = yr + 1
                j += 1
        else:
            # yl > xl, so the left hand part of x[i] is not contained in y[j],
            # so there are some values we should add to the result.
            result.append((xl, yl - 1))

            if yr + 1 <= xr:
                # If y[j] finishes before x[i] does, there may be some values
                # in x[i] left that should go in the result (or they may be
                # removed by a later interval in y), so we update x[i] to
                # reflect that and increment j because it no longer overlaps
                # with any remaining element of x.
                x[i][0] = yr + 1
                j += 1
            else:
                # Every element of x[i] other than the initial part we have
                # already added is contained in y[j], so we move to the next
                # interval.
                i += 1
    # Any remaining intervals in x do not overlap with any of y, as if they did
    # we would not have incremented j to the end, so can be added to the result
    # as they are.
    result.extend(x[i:])
    return tuple(map(tuple, result))


def _intervals(s):
    """Return a tuple of intervals, covering the codepoints of characters in
    `s`.

    >>> _intervals('abcdef0123456789')
    ((48, 57), (97, 102))
    """
    intervals = tuple((ord(c), ord(c)) for c in sorted(s))
    return _union_intervals(intervals, intervals)


category_index_cache = {(): ()}


def _category_key(exclude, include):
    """Return a normalised tuple of all Unicode categories that are in
    `include`, but not in `exclude`.

    If include is None then default to including all categories.
    Any item in include that is not a unicode character will be excluded.

    >>> _category_key(exclude=['So'], include=['Lu', 'Me', 'Cs', 'So'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if include is None:
        include = set(cs)
    else:
        include = set(include)
    exclude = set(exclude or ())
    assert include.issubset(cs)
    assert exclude.issubset(cs)
    include -= exclude
    return tuple(c for c in cs if c in include)


def _query_for_key(key):
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    try:
        return category_index_cache[key]
    except KeyError:
        pass
    assert key
    if set(key) == set(categories()):
        result = ((0, sys.maxunicode),)
    else:
        result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]])
    category_index_cache[key] = result
    return result


limited_category_index_cache = {}  # type: cache_type


def query(
    exclude_categories=(),
    include_categories=None,
    min_codepoint=None,
    max_codepoint=None,
    include_characters="",
    exclude_characters="",
):
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria (min_codepoint <= codepoint(c) <= max_codepoint and
    any(cat in include_categories for cat in categories(c)) and all(cat not in
    exclude_categories for cat in categories(c)) or (c in include_characters)

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'],
    ...       include_characters=u'☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(exclude_categories, include_categories)
    character_intervals = _intervals(include_characters or "")
    exclude_intervals = _intervals(exclude_characters or "")
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals,
        exclude_intervals,
    )
    try:
        return limited_category_index_cache[qkey]
    except KeyError:
        pass
    base = _query_for_key(catkey)
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = tuple(result)
    result = _union_intervals(result, character_intervals)
    result = _subtract_intervals(result, exclude_intervals)
    limited_category_index_cache[qkey] = result
    return result