#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2016-2023 The University of Tromsø & the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
"""Manage corpus files in various ways."""

import collections
import difflib
import os
import shutil
import sys

# urlparse was a Python 2 module; its functions now live in urllib.parse.
from urllib import parse as urlparse

import lxml.html

from corpustools import adder, move_files, namechanger, util, xslsetter


def main():
    # find_files_without_parallels()
    # find_not_analysed(sys.argv[1])
    # fix_pdf_filenames(sys.argv[1])
    move_twenty_percent_to_goldcorpus()


def print_equality_ratios_in_dir():
    """Print rounded SequenceMatcher ratios for pairs of html paths.

    Each path containing sys.argv[2] is compared with the same path where
    sys.argv[2] is replaced by sys.argv[3].
    """
    ratios = set()
    for root, dirs, files in os.walk(sys.argv[1]):
        for f in files:
            if f.endswith(".html") and sys.argv[2] in f:
                path = os.path.join(root, f)
                sm = difflib.SequenceMatcher(
                    a=path, b=path.replace(sys.argv[2], sys.argv[3])
                )
                ratios.add(round(sm.ratio(), 2))

    for ratio in ratios:
        print(ratio)


def remove_files_with_duplicate_content():
    """To replace: 123, , 339, 340"""
    ufflangs = {
        "fin": "finnish",
        "eng": "english",
        "sme": "davvi",
        "smn": "anaras",
        "sms": "nuortta",
    }
    this_lang = "sms"
    fingetter = adder.AddToCorpus(
        str(os.getenv("GTFREE")), "fin", "admin/sd/www.samediggi.fi"
    )
    smsgetter = adder.AddToCorpus(
        str(os.getenv("GTFREE")), this_lang, "admin/sd/www.samediggi.fi"
    )
    for root, dirs, files in os.walk(
        os.path.join(
            os.getenv("GTFREE"), "orig", this_lang, "admin/sd/www.samediggi.fi"
        )
    ):
        print(root)
        for f in files:
            if f.endswith(".xsl") and "itemid=256" in f:
                path = os.path.join(root, f)
                mdh = xslsetter.MetadataHandler(path)
                filename = mdh.get_variable("filename")
                parallellfile = path.replace("/" + this_lang + "/", "/fin/")
                parallellfile = parallellfile.replace(".xsl", "")
                parallellfile = parallellfile.replace(
                    "lang=" + ufflangs[this_lang], "lang=finnish"
                )
                parallellfile = parallellfile.replace("itemid=256", "itemid=195")
                if not os.path.exists(parallellfile):
                    if this_lang != "fin":
                        fingetter.copy_url_to_corpus(
                            filename.replace("Itemid=256", "Itemid=195").replace(
                                "lang=" + ufflangs[this_lang], "lang=finnish"
                            )
                        )
                    smsgetter.copy_url_to_corpus(
                        filename.replace("Itemid=256", "Itemid=195"),
                        parallelpath=parallellfile,
                    )
                    move_files.mover(path.replace(".xsl", ""), "")

    smsgetter.add_files_to_working_copy()
    fingetter.add_files_to_working_copy()


def adder_adderexception_invalid_url():
    """Show how adder.AdderException is raised for an invalid URL."""
    langs = ["davvi", "finnish", "nuortta", "anaras", "english"]
    downloader = adder.UrlDownloader(os.path.join(os.getenv("GTFREE"), "klaff"))
    for lang in langs:
        try:
            (r, tmpname) = downloader.download(
                "http://www.samediggi.fi/index2.php?option=com_content&task=view&id=420&pop=1&page=0&Itemid=149",
                params={"lang": lang},
            )
        except adder.AdderException as e:
            print("something went wrong here", str(e))


def print_finder():
    """Download the print friendly version of pages that have a print button."""
    langs = {
        "eng": "english",
        "fin": "finnish",
        "sme": "davvi",
        "smn": "anaras",
        "sms": "nuortta",
    }
    file_count = 0
    img_count = 0
    downloader = adder.UrlDownloader(os.path.join(os.getenv("GTFREE"), "tmp"))
    for lang in langs:
        for root, dirs, files in os.walk(
            os.path.join(
                os.getenv("GTFREE"), "orig", lang, "admin/sd/www.samediggi.fi"
            )
        ):
            for f in files:
                if f.endswith(".html"):
                    file_count += 1
                    path = os.path.join(root, f)
                    tree = lxml.html.parse(path)
                    print_img = tree.find(
                        './/img[@src="http://www.samediggi.fi/images/M_images/printButton.png"]'
                    )
                    if print_img is not None:
                        img_count += 1
                        parent = print_img.getparent()
                        href = urlparse.urlparse(parent.get("href"))
                        query = href.query
                        # Keep only the option, id and task query parts, then
                        # add the wanted language.
                        newquery = [
                            part
                            for part in query.split("&")
                            if (
                                part.startswith("option")
                                or part.startswith("id")
                                or part.startswith("task")
                            )
                        ]
                        newquery.append("lang=" + langs[lang])
                        newhref = urlparse.urlunparse(
                            (
                                href.scheme,
                                href.netloc,
                                href.path,
                                href.params,
                                "&".join(newquery),
                                href.fragment,
                            )
                        )
                        print("about to download", newhref)
                        (r, tmpname) = downloader.download(newhref)
                        newname = (
                            namechanger.normalise_filename(os.path.basename(newhref))
                            + ".html"
                        )
                        newpath = os.path.join(root, newname)
                        # r.content is bytes, so write in binary mode.
                        with open(newpath, "wb") as newfile:
                            newfile.write(r.content)
                        print("written", newpath)
                        print()
                    else:
                        print("!!!!!!")


def remove_if_no_smX():
    """Remove eng and fin files that have no Sámi language parallel."""
    langs = {
        "eng": "english",
        "fin": "finnish",
        "sme": "davvi",
        "smn": "anaras",
        "sms": "nuortta",
    }
    file_count = 0
    adder.UrlDownloader(os.path.join(os.getenv("GTFREE"), "tmp"))
    for lang1 in ["eng", "fin"]:
        for root, dirs, files in os.walk(
            os.path.join(
                os.getenv("GTFREE"), "orig", lang1, "admin/sd/www.samediggi.fi"
            )
        ):
            for f in files:
                if f.endswith(".html"):
                    file_count += 1
                    path = os.path.join(root, f)
                    smx_exists = False
                    for lang2 in ["sme", "smn", "sms"]:
                        smxpath = path.replace("orig/" + lang1, "orig/" + lang2)
                        smxpath = smxpath.replace(
                            "_lang=" + langs[lang1], "_lang=" + langs[lang2]
                        )
                        if os.path.exists(smxpath):
                            smx_exists = True
                    if not smx_exists:
                        move_files.mover(path, "")


def find_files_without_parallels():
    """Download print versions of the files found in sys.argv[1].

    Update the metadata of the affected files accordingly.
    """
    url_to_filename = {}
    for root, dirs, files in os.walk(sys.argv[1]):
        for f in files:
            if f.endswith(".xsl"):
                file_ = os.path.join(root, f)
                mdh = xslsetter.MetadataHandler(file_)
                url_to_filename[mdh.get_variable("filename")] = file_

    urlset = set(url_to_filename.keys())

    newcounter = 0
    oldcounter = 0
    print_part = "layout/set/print"
    downloader = adder.UrlDownloader(os.path.join(os.getenv("GTFREE"), "tmp"))
    for url in urlset:
        if print_part not in url and ".aspx" not in url:
            parts = urlparse.urlsplit(url)
            nurl = urlparse.urlunparse(
                (
                    parts.scheme,
                    os.path.join(parts.netloc, print_part),
                    parts.path,
                    "",
                    parts.query,
                    parts.fragment,
                )
            )
            util.print_frame(debug=nurl)
            try:
                (r, tmpname) = downloader.download(nurl)
                newfilename = namechanger.normalise_filename(
                    os.path.basename(tmpname)
                )
                newpath = os.path.join(
                    os.path.dirname(url_to_filename[url]), newfilename
                )
                oldpath = url_to_filename[url].replace(".xsl", "")
                if os.path.exists(newpath) and oldpath != newpath:
                    shutil.copy(tmpname, newpath)
                    move_files.mover(oldpath, "")
                    mdh = xslsetter.MetadataHandler(newpath + ".xsl")
                    mdh.set_variable("filename", nurl)
                    mdh.write_file()
                elif oldpath != newpath:
                    mdh = xslsetter.MetadataHandler(oldpath + ".xsl")
                    mdh.set_variable("filename", nurl)
                    mdh.write_file()
                    move_files.mover(oldpath, newpath)
                else:
                    mdh = xslsetter.MetadataHandler(oldpath + ".xsl")
                    mdh.set_variable("filename", nurl)
                    mdh.write_file()
                    shutil.copy(tmpname, oldpath)
            except adder.AdderException as e:
                util.print_frame(debug=str(e))

    print("new files", newcounter)
    print("old files", oldcounter)


def find_not_analysed(directory):
    """Print converted files that have an original but no analysed version."""
    for root, dirs, files in os.walk(directory):
        for f in files:
            ana_root = root.replace("converted/", "analysed/")
            ana_file = os.path.join(ana_root, f)
            orig_root = root.replace("converted/", "orig/")
            orig_file = os.path.join(orig_root, f.replace(".xml", ""))
            if (
                "plenum_no/dc" not in ana_file
                and not os.path.exists(ana_file)
                and os.path.exists(orig_file)
            ):
                print(os.path.join(root, f))


def find_no_sami_parallel(directory):
    """Remove files that have no Sámi language parallel text."""
    for root, dirs, files in os.walk(directory):
        for f in files:
            if f.endswith(".xsl"):
                file_ = os.path.join(root, f)
                mdh = xslsetter.MetadataHandler(file_)
                p = mdh.get_parallel_texts()
                for lang in p:
                    if lang in ["sme", "sma", "smj", "smn", "sms"]:
                        break
                else:
                    # The for-else triggers when no Sámi parallel was found.
                    move_files.mover(file_[:-4], "")


def fix_pdf_filenames(directory):
    """Rename pdf files to the normalised name of their downloaded content."""
    for root, dirs, files in os.walk(directory):
        for f in files:
            if f.endswith(".pdf"):
                # os.path.join returns str in Python 3; no .decode needed.
                file_ = os.path.join(root, f)
                mdh = xslsetter.MetadataHandler(file_ + ".xsl")
                url = mdh.get_variable("filename")
                # print(file_, url)
                downloader = adder.UrlDownloader(
                    os.path.join(os.getenv("GTFREE"), "tmp")
                )
                with util.ignored(KeyError):
                    try:
                        (r, tmpname) = downloader.download(url)
                    except adder.AdderException as e:
                        print(str(e))
                    else:
                        newfilename = namechanger.normalise_filename(
                            os.path.basename(tmpname)
                        )
                        if newfilename != f:
                            util.print_frame(debug=newfilename)
                            move_files.mover(
                                file_, os.path.join(root, newfilename)
                            )


def move_twenty_percent_to_goldcorpus():
    """Move twenty percent of the files to the goldcorpus."""
    directories = [
        "orig/sme/admin/sd/cealkamusat_fi",
        "orig/sme/admin/sd/davviriikkalas_samekonvensuvdna_fi",
        "orig/sme/admin/sd/inaugurations_fi",
        "orig/sme/admin/sd/ohcan_lahkai_fi",
        "orig/sme/admin/sd/sami_parlamentarals_raddi_fi",
        "orig/sme/admin/sd/www.samediggi.fi",
        "orig/sme/facta/samediggi.fi/",
    ]
    fluff = collections.defaultdict(list)
    for directory in directories:
        for root, dirs, files in os.walk(
            os.path.join(os.getenv("GTFREE"), directory)
        ):
            for f in files:
                if f.endswith(".xsl"):
                    name = os.path.join(root, f[:-4])
                    size = os.path.getsize(name)
                    fluff[size].append(name)

    # Walk the files from largest to smallest and move every fifth one,
    # i.e. twenty percent, to the gold standard corpus.
    i = 0
    for size in sorted(fluff, reverse=True):
        for f in fluff[size]:
            if i == 4:
                move_files.mover(f, f.replace("orig/", "goldstandard/orig/"))
                i = 0
            i += 1
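

# Assumed entry point: main() is defined above but never invoked in this
# snippet, so this guard is an addition following the usual convention for
# runnable scripts.
if __name__ == "__main__":
    main()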