#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see <http://www.gnu.org/licenses/>.
#
#   Copyright © 2012-2023 The University of Tromsø &
#                         the Norwegian Sámi Parliament
#   http://giellatekno.uit.no & http://divvun.no
#
"""Convert pdf files to the Giella xml format."""

import collections
import re
from copy import deepcopy

from lxml import etree

from corpustools import basicconverter, util, xslsetter

LETTER_AT_START = re.compile(r"[^\W\d_].*", re.UNICODE)
LETTER_HYPHEN_AT_END = re.compile(r".*[^\W\d_]-$", re.UNICODE)

# Matches <br/>, <br> and <br />.
_BR_TAG = re.compile(r"<br\s*/?>")

# The four page sides, in the order the margin dicts are built.
_MARGIN_NAMES = ("right_margin", "left_margin", "top_margin", "bottom_margin")


def styles(page_style):
    """Turn inline css styles into a dict.

    Args:
        page_style (str): css declarations separated by ";", e.g.
            "top:10px;left:20px".

    Returns:
        (dict[str, str]): maps property names to their values, with every
            "px" removed from the values.
    """
    parsed = {}
    for style_pair in page_style.split(";"):
        # partition keeps values that themselves contain ":" intact.
        name, colon, value = style_pair.partition(":")
        if not colon:
            # Empty chunk (e.g. after a trailing ";") or a bare word.
            continue
        parsed[name.strip()] = value.replace("px", "").strip()

    return parsed


def merge(first, second):
    """Merge two paragraph elements into one.

    The text and the children of second are appended to first.

    Args:
        first (etree.Element): the element that receives the content.
        second (etree.Element): the element whose content is moved.

    Returns:
        (etree.Element): first, after it has been modified in place.
    """
    if len(first):
        first[-1].tail = f"{first[-1].tail or ''}{second.text or ''}"
    elif second.text:
        # The tail of the second p is not important, it is always empty
        first.text = f"{first.text or ''}{second.text}"

    # Iterate over a snapshot, appending moves the child out of second.
    for child in list(second):
        first.append(child)

    return first


def merge_i(first, second):
    """Merge two elements into one.

    The text and the children of second are appended to first, and first
    takes over the tail of second.

    Args:
        first (etree.Element): the element that receives the content.
        second (etree.Element): the element whose content is moved.

    Returns:
        (etree.Element): first, after it has been modified in place.
    """
    if len(first):
        if second.text:
            first[-1].tail = f"{first[-1].tail or ''}{second.text}"
    else:
        # The old tail of first ends up inside the merged element.
        first.text = f"{first.text or ''}{first.tail or ''}{second.text or ''}"

    first.tail = second.tail
    # Iterate over a snapshot, appending moves the child out of second.
    for child in list(second):
        first.append(child)

    return first


def merge_children_of_p(paragraph):
    """Merge adjacent children of a paragraph that have the same tag.

    Two neighbouring children are merged when they share a tag and the
    tail of the first one is empty or only whitespace.

    Args:
        paragraph (etree.Element): the paragraph to tidy up.

    Returns:
        (etree.Element): a new p element when paragraph has more than one
            child, otherwise paragraph itself.
    """
    if len(paragraph) <= 1:
        return paragraph

    new_paragraph = etree.Element("p")
    new_paragraph.text = paragraph.text

    # Copy the first child the same way as the following ones, so that its
    # attributes and nested elements are kept.
    child = deepcopy(paragraph[0])
    for next_child in paragraph[1:]:
        tail_is_blank = child.tail is None or not child.tail.strip()
        if next_child.tag == child.tag and tail_is_blank:
            child = merge_i(child, next_child)
        else:
            new_paragraph.append(child)
            child = deepcopy(next_child)

    new_paragraph.append(child)

    return new_paragraph


def is_probably_hyphenated(previous, current):
    """Find out if previous is part of a hyphenated word.

    Args:
        previous (str): the previous string in front of a particular br tag
        current (str): the current string following a particular br tag

    Returns:
        (bool): True if previous is part of a hyphenated word, False
            otherwise
    """
    previous1 = previous[-2:]
    current1 = current[:2]

    # current[0] is only reached when LETTER_AT_START matched, which means
    # that current is not empty.
    return bool(
        LETTER_HYPHEN_AT_END.match(previous1)
        and LETTER_AT_START.match(current1)
        and current[0] == current[0].lower()
    )


def handle_br(previous, current):
    """Handle br tags in p elements.

    Args:
        previous (str): the previous string in front of a particular br tag
        current (str): the current string following a particular br tag

    Returns:
        (str): A possibly modified version of previous
    """
    # Remove hyphen
    if is_probably_hyphenated(previous, current):
        return previous[:-1]

    # Preserve hyphen
    if previous and previous[-1] == "-":
        return previous

    # Turn br tag into space
    return f"{previous} "


PDFFontspec = collections.namedtuple("PDFFontspec", ["size", "family", "color"])


class PDFFontspecs:
    """Add font specs found in a pdf page to this class.

    Attributes:
        pdffontspecs (dict[PDFFontspec, str]): map fontspecs to fontspec ids.
        duplicates (dict[str, str]): map ids of duplicate fontspecs to the
            id of the first instance of this fontspec.
    """

    def __init__(self):
        """Initialise the PDFFontspecs class."""
        self.pdffontspecs = {}
        self.duplicates = {}

    def add_fontspec(self, xmlfontspec):
        """Add a pdf2xml fontspec to this class.

        Args:
            xmlfontspec (etree.Element): a PDF2XML fontspec element found in
                a PDF2XML page element.
        """
        this_id = xmlfontspec.get("id")
        this_fontspec = PDFFontspec(
            size=xmlfontspec.get("size"),
            family=xmlfontspec.get("family"),
            color=xmlfontspec.get("color"),
        )

        if this_fontspec in self.pdffontspecs:
            self.duplicates[this_id] = self.pdffontspecs[this_fontspec]
        else:
            self.pdffontspecs[this_fontspec] = this_id

    def corrected_id(self, font_id):
        """Return a corrected id of a fontspec.

        Some xmlfontspecs have different id's for an identical font.
        This function makes sure identical fonts have identical id's.

        Args:
            font_id (str): the id of the fontspec, as read from the xml.

        Returns:
            (str): the id of the first fontspec that is identical to this
                one, or font_id itself when it is not a duplicate.
        """
        return self.duplicates.get(font_id, font_id)


class PDFEmptyPageError(Exception):
    """Raise this exception if a pdf page is empty."""


class PDFPageMetadata:
    """Read pdf metadata from the metadata file into this class.

    Compute metadata needed by the conversion from the data contained in
    this class.
    """

    def __init__(
        self, page_id, page_style, metadata_margins=None, metadata_inner_margins=None
    ):
        """Initialise the PDFPageMetadata class.

        Args:
            page_id (str): the page id
            page_style (str): the styles as a css string
            metadata_margins (dict): a dict containing margins read from
                the metadata file.
            metadata_inner_margins (dict): a dict containing inner_margins
                read from the metadata file.
        """
        self.page_number = int(page_id.replace("page", "").replace("-div", ""))
        style = styles(page_style)
        self.page_height = int(style.get("height"))
        self.page_width = int(style.get("width"))
        self.metadata_margins = metadata_margins or {}
        self.metadata_inner_margins = metadata_inner_margins or {}

    def _lookup_coefficient(self, all_margins, margin):
        """Find the percentage that applies to margin on this page.

        A page specific value wins over "all", which wins over "even" and
        "odd".

        Args:
            all_margins (dict): maps margin names to dicts of percentages.
            margin (str): the name of the margin.

        Returns:
            The percentage found, or 0 when nothing applies to this page.
        """
        margin_data = all_margins.get(margin)
        if not margin_data:
            return 0

        parity = "even" if self.page_number % 2 == 0 else "odd"
        for key in (str(self.page_number), "all", parity):
            value = margin_data.get(key)
            if value is not None:
                return value

        return 0

    def _margin_position(self, side, coefficient):
        """Turn a percentage into a pixel position on the page.

        Args:
            side (str): one of left_margin, right_margin, top_margin and
                bottom_margin.
            coefficient: the width of the margin in percent.

        Returns:
            (int | None): the position of the margin, None if side is not
                one of the four known names.
        """
        if side == "left_margin":
            return int(coefficient * self.page_width / 100.0)
        if side == "right_margin":
            return int(self.page_width - coefficient * self.page_width / 100.0)
        if side == "top_margin":
            return int(coefficient * self.page_height / 100.0)
        if side == "bottom_margin":
            return int(self.page_height - coefficient * self.page_height / 100.0)

        return None

    def compute_margins(self):
        """Compute the margins of a page in pixels.

        Returns:
            (dict): a dict containing the four margins in pixels
        """
        return {margin: self.compute_margin(margin) for margin in _MARGIN_NAMES}

    def compute_margin(self, margin):
        """Compute a margin in pixels.

        Args:
            margin (str): the name of the margin

        Returns:
            (int): an int telling where the margin is on the page.
        """
        return self._margin_position(margin, self.get_coefficient(margin))

    def get_coefficient(self, margin):
        """Get the width of the margin in percent."""
        return self._lookup_coefficient(self.metadata_margins, margin)

    def compute_inner_margins(self):
        """Compute inner margins of the document.

        Returns:
            (dict): A dict where the key is the name of the margin and the
                value is an integer indicating where the margin is on the
                page. The dict is empty when the inner margins span the
                whole page.
        """
        margins = {
            margin: self.compute_inner_margin(f"inner_{margin}")
            for margin in _MARGIN_NAMES
        }

        if (
            margins["bottom_margin"] == self.page_height
            and margins["top_margin"] == 0
            and margins["left_margin"] == 0
            and margins["right_margin"] == self.page_width
        ):
            margins = {}

        return margins

    def compute_inner_margin(self, margin):
        """Compute a margin in pixels.

        Args:
            margin (str): the name of the margin, prefixed with inner_

        Returns:
            (int): an int telling where the margin is on the page.
        """
        prefix = "inner_"
        if not margin.startswith(prefix):
            return None

        return self._margin_position(
            margin[len(prefix) :], self.get_inner_coefficient(margin)
        )

    def get_inner_coefficient(self, margin):
        """Get the width of the margin in percent."""
        return self._lookup_coefficient(self.metadata_inner_margins, margin)


class PDFPage:
    """Reads a page element.

    Attributes:
        page_element (etree.Element): the element representing the page
        textelements (list): legacy list of text elements, empty unless a
            caller fills it
        pdf_pagemetadata (PDFPageMetadata): contains the metadata of the page
        linespacing_dict (dict): the linespacing settings given to __init__
    """

    def __init__(
        self,
        page_element,
        metadata_margins=None,
        metadata_inner_margins=None,
        linespacing=None,
    ):
        """Initialise the PDFPage class.

        Args:
            page_element (etree.Element): an etree element representing a pdf page
            metadata_margins (dict): a dict containing margins read from the metadata
                file.
            metadata_inner_margins (dict): a dict containing inner_margins read from
                the metadata file.
            linespacing (dict): linespacing settings, keyed by "all", "even",
                "odd" or a page number.
        """
        self.page_element = page_element
        # fix_font_id and remove_elements_outside_margin read this list.
        self.textelements = []
        # The linespacing property reads this dict.
        self.linespacing_dict = linespacing or {}
        self.pdf_pagemetadata = PDFPageMetadata(
            page_id=page_element.get("id"),
            page_style=page_element.get("style"),
            metadata_margins=metadata_margins,
            metadata_inner_margins=metadata_inner_margins,
        )

    def is_skip_page(self, skip_pages):
        """Found out if this page should be skipped.

        Args:
            skip_pages (list of mixed): list of the pages that should be
                skipped.

        Returns:
            (bool): True if this page should be skipped, otherwise false.
        """
        page_number = self.pdf_pagemetadata.page_number
        return (
            ("odd" in skip_pages and page_number % 2 == 1)
            or ("even" in skip_pages and page_number % 2 == 0)
            or page_number in skip_pages
        )

    @property
    def linespacing(self):
        """Return linespacing.

        "all" wins over "even" and "odd", which win over a page specific
        value. 1.5 is used when nothing is set for this page.
        """
        page_number = self.pdf_pagemetadata.page_number
        parity = "even" if page_number % 2 == 0 else "odd"
        for key in ("all", parity, page_number):
            value = self.linespacing_dict.get(key)
            if value:
                return value

        return 1.5

    def fix_font_id(self, pdffontspecs):
        """Fix font id in text elements.

        Sometimes the same font has different ID's. Correct that ID if
        necessary.

        Args:
            pdffontspecs (PDFFontspecs): a PDFFontspecs instance.
        """
        # NOTE(review): the items in textelements must offer .font and
        # .text_elt, plain etree elements do not.
        for textelement in self.textelements:
            correct = pdffontspecs.corrected_id(textelement.font)
            textelement.text_elt.set("font", correct)

    def remove_elements_outside_margin(self):
        """Remove text elements from textelements if needed."""
        margins = self.pdf_pagemetadata.compute_margins()
        inner_margins = self.pdf_pagemetadata.compute_inner_margins()

        self.textelements[:] = [
            t for t in self.textelements if self.is_inside_margins(t, margins)
        ]
        if inner_margins:
            # Same check as pick_valid_text_elements uses for inner margins.
            self.textelements[:] = [
                t
                for t in self.textelements
                if not self.is_inside_margins(t, inner_margins)
            ]

    @staticmethod
    def is_inside_margins(text, margins):
        """Check if text is inside the given margins.

        Args:
            text (etree.Element): an element with top and left set in its
                style attribute.
            margins (dict): the four margins in pixels, may be empty.

        Returns:
            (bool): True if the top left corner of text is strictly inside
                the margins, False otherwise and when margins is empty.
        """
        if not margins:
            return False

        style = styles(text.get("style"))
        top = int(style.get("top"))
        left = int(style.get("left"))

        return (
            margins["top_margin"] < top < margins["bottom_margin"]
            and margins["left_margin"] < left < margins["right_margin"]
        )

    def pick_valid_text_elements(self):
        """Pick the wanted text elements from a page.

        This is the main function of this class

        Yields:
            (etree.Element): a copy of each p element that is inside the
                margins and not inside the inner margins.
        """
        margins = self.pdf_pagemetadata.compute_margins()
        inner_margins = self.pdf_pagemetadata.compute_inner_margins()
        for paragraph in self.page_element.iter("p"):
            if self.is_inside_margins(
                paragraph, margins
            ) and not self.is_inside_margins(paragraph, inner_margins):
                yield deepcopy(paragraph)


class PDF2XMLConverter(basicconverter.BasicConverter):
    """Class to convert the xml output of pdftohtml to Giella xml.

    Attributes:
        pdffontspecs (PDFFontspecs): class to store fontspecs found in the
            xml pages.
    """

    def __init__(self, filename):
        """Initialise the PDF2XMLConverte class.

        Args:
            filename (str): the path to the pdf file.
        """
        super().__init__(filename)
        self.pdffontspecs = PDFFontspecs()

    @staticmethod
    def strip_chars(content, extra=""):
        """Strip unwanted chars from the document.

        Args:
            content (str): the xml document that pdftohtml produces
            extra (str): more character that should be removed

        Returns:
            (str): containing the modified version of the document.
        """
        remove_re = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
        content, _ = remove_re.subn("", content)

        # Microsoft Word PDF's have Latin-1 file names in links; we
        # don't actually need any link attributes:
        content = re.sub("<a [^>]+>", "<a>", content)

        return content

    @staticmethod
    def replace_ligatures(content):
        """Replace unwanted strings with correct replacements.

        Args:
            content (str): content of an xml document.

        Returns:
            (str): String containing the new content of the xml document.
        """
        replacements = {
            "[dstrok]": "đ",
            "[Dstrok]": "Đ",
            "[tstrok]": "ŧ",
            "[Tstrok]": "Ŧ",
            "[scaron]": "š",
            "[Scaron]": "Š",
            "[zcaron]": "ž",
            "[Zcaron]": "Ž",
            "[ccaron]": "č",
            "[Ccaron]": "Č",
            "[eng": "ŋ",
            " ]": "",
            "Ď": "đ",  # cough
            "ď": "đ",  # cough
            # The keys below are single ligature code points. Spelling them
            # as plain letters would make the loop remove the space that
            # follows every ordinary "fi", "fl" and "ff".
            "\ufb01": "fi",
            "\ufb02": "fl",
            "\ufb00": "ff",
            "\ufb03": "ffi",
            "\ufb04": "ffl",
            "\ufb05": "ft",
        }

        for key, value in replacements.items():
            # A space following the unwanted string is removed as well.
            content = content.replace(key + " ", value)
            content = content.replace(key, value)

        return content

    def convert2intermediate(self):
        """Convert from pdf to a corpus xml file.

        Returns:
            (lxml.etree.Element): A corpus xml etree with the content of
                the pdf file, but without most of the metadata.
        """
        # Build the argument list directly, splitting a command string on
        # whitespace breaks paths that contain spaces.
        command = [
            "pdftohtml",
            "-hidden",
            "-enc",
            "UTF-8",
            "-stdout",
            "-nodrm",
            "-i",
            "-s",
            "-wbt",
            str(self.metadata.get_variable("word_break_threshold")),
            str(self.orig),
        ]
        pdftohtmloutput = self.extract_text(command)
        return self.pdftohtml2intermediate(pdftohtmloutput)

    @staticmethod
    def possibly_add_to_body(body, this_p):
        """Append this_p to body if it has text or children.

        Args:
            body (etree.Element): the element that collects the paragraphs.
            this_p (etree.Element): the paragraph to add.
        """
        if this_p.text or len(this_p):
            body.append(this_p)

    def pdftohtml2intermediate(self, pdftohtmloutput):
        """Convert output of pdftohtml to a corpus xml file.

        Args:
            pdftohtmloutput (str): the html document that pdftohtml made.

        Returns:
            (lxml.etree.Element): A corpus xml etree with the content of
                the pdf file, but without most of the metadata.
        """
        pdf_content = self.split_by_br(
            self.replace_ligatures(self.strip_chars(pdftohtmloutput))
        )

        document = etree.Element("html")
        body = etree.SubElement(document, "body")

        try:
            parser = etree.HTMLParser()
            root_element = etree.fromstring(pdf_content.encode("utf8"), parser=parser)
        except etree.XMLSyntaxError as error:
            # handle_syntaxerror always raises util.ConversionError.
            self.handle_syntaxerror(error, util.lineno(), pdf_content)

        this_p = etree.Element("p")
        for paragraph in self.parse_pages(root_element):
            text = paragraph.xpath("string()").strip()
            if not text:
                continue

            # Text that does not start with a lowercase character starts a
            # new paragraph, anything else continues the current one.
            if text[0] != text[0].lower():
                self.possibly_add_to_body(body, merge_children_of_p(this_p))
                this_p = etree.Element("p")
            this_p = merge(this_p, paragraph)

        self.possibly_add_to_body(body, merge_children_of_p(this_p))

        return document

    def pdftohtml2html(self, pdftohtmloutput):
        """Convert output of pdftohtml to html (applying our regular fixes)

        Args:
            pdftohtmloutput (str): the html document that pdftohtml made.

        Returns:
            (bytes): An utf8 encoded html file with the content of the pdf
                file, but without most of the metadata.
        """
        doc = self.pdftohtml2intermediate(pdftohtmloutput)
        meta = etree.Element("meta")
        meta.attrib["charset"] = "utf-8"
        doc.insert(0, meta)
        for header in doc.findall("header"):
            doc.remove(header)
        doc.tag = "html"
        # Fall back to "se" when no main language is set.
        doc.attrib["lang"] = self.metadata.get_variable("mainlang") or "se"

        return etree.tostring(doc, encoding="utf8", method="html", pretty_print=True)

    def parse_page(self, page):
        """Parse the page element.

        Args:
            page (Any): a pdf xml page element.

        Yields:
            (etree.Element): the wanted p elements of the page, nothing if
                the page is among the pages to skip.

        Raises:
            util.ConversionError: if an xslsetter.XsltError is raised while
                handling the page.
        """
        try:
            pdfpage = PDFPage(
                page,
                metadata_margins=self.metadata.margins,
                metadata_inner_margins=self.metadata.inner_margins,
                linespacing=self.metadata.linespacing,
            )
            if not pdfpage.is_skip_page(self.metadata.skip_pages):
                # Correcting font ids with fix_font_id is disabled here.
                yield from pdfpage.pick_valid_text_elements()
        except xslsetter.XsltError as error:
            raise util.ConversionError(str(error)) from error

    def parse_pages(self, root_element):
        """Parse the pages of the pdf xml document.

        Args:
            root_element (xml.etree.Element): the root element of the pdf2xml
                document.

        Returns:
            A generator of the wanted p elements of all the pages.
        """
        return (
            paragraph
            for page in root_element.xpath('//div[starts-with(@id, "page")]')
            for paragraph in self.parse_page(page)
        )

    def add_fontspecs(self, page):
        """Extract font specs found in a pdf2xml page element.

        Args:
            page (etree.Element): a pdf page
        """
        for xmlfontspec in page.iter("fontspec"):
            self.pdffontspecs.add_fontspec(xmlfontspec)

    def split_by_br(self, text):
        """Replace br tags in text with what handle_br decides.

        Non-breaking spaces are turned into ordinary spaces first.

        Args:
            text (str): the html document as a string.

        Returns:
            (str): text itself if it contains no br tag, otherwise text
                where each br tag is replaced by a space or removed.
        """
        unbroken = text.replace("&#160;", " ").replace("\u00a0", " ")
        brs = _BR_TAG.split(unbroken)

        if len(brs) == 1:
            return text

        strings = [handle_br(previous, current) for previous, current in zip(brs, brs[1:])]
        strings.append(brs[-1])

        return "".join(strings)

    def extract_text(self, command):
        """Extract the text from a document.

        Args:
            command (list[str]): a list containing the command and
                the arguments sent to ExternalCommandRunner.

        Returns:
            (str): the utf8 decoded output of the program

        Raises:
            util.ConversionError: if the program exits with a non-zero
                return code. Its output is then written to a log file.
        """
        runner = util.ExternalCommandRunner()
        runner.run(command, cwd="/tmp")

        if runner.returncode != 0:
            logname = f"{self.orig}.log"
            with open(logname, "w", encoding="utf8") as logfile:
                print(f"stdout\n{runner.stdout}\n", file=logfile)
                print(f"stderr\n{runner.stderr}\n", file=logfile)
                raise util.ConversionError(
                    f"{command[0]} failed. More info in the log file: {logname}"
                )

        return runner.stdout.decode("utf8")

    def handle_syntaxerror(self, error, lineno, invalid_input):
        """Handle an xml syntax error.

        Args:
            error (Exception): an exception
            lineno (int): the line number in this module where the error
                happened.
            invalid_input (str): a string containing the invalid input.

        Raises:
            util.ConversionError: always, after the log file is written.
        """
        # str() formatting keeps this working when self.orig is not a str.
        logname = f"{self.orig}.log"
        # backslashreplace makes sure every message can be written.
        with open(
            logname, "w", encoding="utf8", errors="backslashreplace"
        ) as logfile:
            logfile.write(f"Error at: {lineno}")
            for entry in error.error_log:
                logfile.write(f"\n{entry.line}: {entry.column} ")
                logfile.write(entry.message)
                logfile.write("\n")

            logfile.write(invalid_input)

        raise util.ConversionError(
            f"{type(self).__name__}: log is found in {logname}"
        )


def to_html_elt(path):
    """Convert a pdf document to the Giella xml format.

    Args:
        path (str): path to the document

    Returns:
        (lxml.etree.Element): the root element of the Giella xml document
    """
    converter = PDF2XMLConverter(path)
    return converter.convert2intermediate()