"""Reader for fort.14 dataset files."""

__copyright__ = "(C) Copyright Aquaveo 2023"
__license__ = "All rights reserved"

# 1. Standard Python modules
from pathlib import Path
from typing import Sequence

# 2. Third party modules
import numpy as np

# 3. Aquaveo modules

# 4. Local modules
from ..file_reader import FileReader


def read_dataset(path: Path | str, num_datasets: int) -> list[Sequence[float]]:
    """
    Load the datasets stored in a .fort.14 dataset file.

    The file is expected to hold one or more datasets and no geometry. Whatever follows the dataset definition is
    ignored without any warning.

    Every record in the file carries an ID, and those IDs may be out of order or have gaps. When that happens, the
    order of values in the returned datasets is unspecified, but it is guaranteed to match the geometry reader. Put
    differently: if a .fort.14 dataset file and a .fort.14 geometry file use the same set of IDs, the value at
    location N of each dataset is the one that belongs to node N of the UGrid.

    Args:
        path: The file to read.
        num_datasets: How many datasets the file contains. The node ID, x-coordinate, and y-coordinate columns are
            thrown away and are not part of this count, so it should generally be the file's total column count
            minus 3.

    Returns:
        One sequence of values per dataset, in the order the dataset columns appear in the file.
    """
    # Each record leads with an ID, an x, and a y before the dataset values begin.
    total_columns = 3 + num_datasets

    with FileReader(path, comment_markers=['!', '=']) as reader:
        # The first line is implicitly a comment, so step past it.
        reader.next_line()
        table = _read_datasets(reader, total_columns)

    # Drop the ID/x/y columns, then turn each remaining column into a plain Python list.
    return [values.tolist() for values in table[:, 3:].T]


def _read_datasets(reader: FileReader, num_columns: int) -> np.ndarray:
    """
    Read every record of a dataset file into a table sorted by node ID.

    The first two integers read are the cell count (discarded) and the record count. After that, one record is read
    per line: an integer node ID followed by `num_columns - 1` floats.

    Args:
        reader: The reader to pull values from. It is assumed to already be positioned on the line holding the cell
            and record counts.
        num_columns: The number of values in each record, including the leading node ID.

    Returns:
        A float array with one row per record and `num_columns` columns. Rows are ordered by ascending node ID, which
        is stored in column 0.

    Raises:
        The exception produced by `reader.error` if two records share the same node ID.
    """
    reader.read_int()  # Number of cells. Unused, since the connectivity section is ignored in datasets.
    num_records = reader.read_int()
    datasets = np.zeros((num_records, num_columns), dtype=float)

    used_ids = set()
    # Iterating a 2D array yields row views, so writing to `row` fills `datasets` in place.
    for row in datasets:
        reader.next_line()
        node_id = reader.read_int()
        if node_id in used_ids:
            raise reader.error('Duplicate ID.')
        used_ids.add(node_id)

        row[0] = node_id
        for column in range(1, num_columns):
            row[column] = reader.read_float()

    # IDs are unique at this point, so ordering rows by column 0 is unambiguous.
    return datasets[datasets[:, 0].argsort()]
