# -*- coding:utf-8 -*-

# Copyright © 2020-2024 UiT The Arctic University of Norway
# License: GPL3  # noqa: ERA001
# Author: Børre Gaup
"""Parser for divvun-runtime output."""

import json
import re
from typing import Any

# CSI escape sequence: ESC, "[", optional numeric parameters separated by
# ";" (e.g. 48;2;239;241;245), and one final letter such as "m" or "K".
# Compiled once at import time instead of on every call.
_ANSI_ESCAPE_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]")


def strip_ansi_codes(text: str) -> str:
    r"""Remove ANSI color codes from text.

    ANSI color codes follow the pattern:
    - ESC[<params>m where ESC is \x1b (octal \033)
    - Can also have a format like ESC[48;2;R;G;Bm for 24-bit colors

    Args:
        text: String potentially containing ANSI escape codes

    Returns:
        Clean string without ANSI codes
    """
    return _ANSI_ESCAPE_RE.sub("", text)


def extract_json_from_runtime_output(output: str) -> str:
    """Extract clean JSON from divvun-runtime output.

    divvun-runtime may output:
    - ANSI color codes for pretty printing
    - Log messages to stderr (should be filtered by subprocess)
    - The actual JSON response

    The object is located by scanning from the first "{" to its matching
    "}". Braces inside JSON string literals are ignored, so a checked
    text that itself contains "{" or "}" does not end the object early.

    Args:
        output: Raw output from divvun-runtime

    Returns:
        The text of the first brace-balanced JSON object, or "{}" when
        no opening brace or no matching closing brace is found.
    """
    clean_output = strip_ansi_codes(output)

    json_start = clean_output.find("{")
    if json_start == -1:
        return "{}"

    depth = 0
    in_string = False
    escaped = False
    for index in range(json_start, len(clean_output)):
        char = clean_output[index]
        if in_string:
            if escaped:
                # The character after a backslash never closes the string.
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return clean_output[json_start:index + 1]

    # The opening brace was never closed.
    return "{}"


def parse_runtime_response(output: str) -> dict[str, Any]:
    """Parse divvun-runtime JSON response.

    Args:
        output: Raw output from divvun-runtime command

    Returns:
        Parsed JSON as a dictionary (expected keys: text, errors,
        encoding). When the extracted text is not valid JSON, a default
        response with empty text, no errors and encoding "utf-8" is
        returned. When no JSON object is found at all, the result is an
        empty dict.
    """
    json_str = extract_json_from_runtime_output(output)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        return {"text": "", "errors": [], "encoding": "utf-8"}


def _char_count_before(encoded: bytes, byte_offset: int) -> int:
    """Count the complete UTF-8 characters in the first byte_offset bytes."""
    # A negative offset would slice from the end of the data; treat it as 0.
    prefix = encoded[:max(byte_offset, 0)]
    # errors="ignore" drops a multi-byte character cut in half by the slice.
    return len(prefix.decode("utf-8", errors="ignore"))


def split_runtime_output_by_lines(
    runtime_response: dict[str, Any],
) -> list[dict[str, Any]]:
    r"""Split divvun-runtime output into separate results for each line.

    divvun-runtime treats entire input as one text block, but
    divvun-checker processes each line separately. This function splits
    the runtime output to match checker behavior.

    IMPORTANT: Byte offsets are converted to character offsets before
    splitting, since line boundaries are defined by character positions.

    An error is assigned to a line only when it starts inside that line
    and ends at or before the line's end; errors that do not fit inside
    a single line are left out of the result.

    Args:
        runtime_response: Parsed JSON from divvun-runtime with keys:
            - text: Full input text (may contain \n)
            - errors: List of error objects with byte offsets
            - encoding: Text encoding
            A missing or null text/errors value is treated as empty.

    Returns:
        List of dictionaries, one per line, each with:
            - text: Single line of text
            - errors: Errors for that line only (with character offsets
              adjusted to line start)
    """
    text = runtime_response.get("text") or ""
    all_errors = runtime_response.get("errors") or []

    # Encode once; every offset conversion reuses the same bytes.
    encoded = text.encode("utf-8")

    char_errors = []
    for error in all_errors:
        char_error = error.copy()
        char_error["start"] = _char_count_before(encoded, error.get("start", 0))
        char_error["end"] = _char_count_before(encoded, error.get("end", 0))
        char_errors.append(char_error)

    results = []
    line_start = 0
    for line_text in text.split("\n"):
        line_end = line_start + len(line_text)
        line_errors = [
            {
                **error,
                "start": error["start"] - line_start,
                "end": error["end"] - line_start,
            }
            for error in char_errors
            if line_start <= error["start"] < line_end
            and error["end"] <= line_end
        ]
        results.append({"text": line_text, "errors": line_errors})
        # Skip the "\n" that separated this line from the next one.
        line_start = line_end + 1

    return results


def byte_offset_to_char_offset(text: str, byte_offset: int) -> int:
    """Convert byte offset to character offset in UTF-8 text.

    Args:
        text: The text string
        byte_offset: Byte offset in UTF-8 encoding. Negative values are
            treated as 0; values past the end count the whole text. An
            offset that falls inside a multi-byte character maps to the
            start of that character.

    Returns:
        Character offset (position in the string)
    """
    return _char_count_before(text.encode("utf-8"), byte_offset)


def convert_runtime_error_to_checker_format(error: dict[str, Any]) -> list[Any]:
    """Convert a divvun-runtime error object to divvun-checker format.

    NOTE: This function expects errors with character offsets (already
    converted from bytes).

    Runtime format:
        {
            "form": "leam",
            "start": 4,
            "end": 8,
            "error_id": "err-typo",
            "title": "Spelling error",
            "description": "Not in the dictionary.",
            "suggestions": []
        }

    Checker format (list):
        ["leam", 4, 8, "typo", "Ii leat sátnelisttus", [], "Čállinmeattáhus"]
        [form, start, end, error_type, description, suggestions, title]

    Args:
        error: Error dict from divvun-runtime (with character offsets)

    Returns:
        List in divvun-checker format
    """
    # Strip only a leading "err-"; later occurrences belong to the id itself.
    error_type = error.get("error_id", "").removeprefix("err-")

    return [
        error.get("form", ""),
        error.get("start", 0),
        error.get("end", 0),
        error_type,
        error.get("description", ""),
        error.get("suggestions", []),
        error.get("title", ""),
    ]


def convert_runtime_to_checker_format(
    runtime_response: dict[str, Any],
) -> list[dict[str, Any]]:
    """Convert divvun-runtime response to divvun-checker format.

    Converts from runtime's single-response format to checker's
    line-by-line format.

    Args:
        runtime_response: Parsed JSON from divvun-runtime

    Returns:
        List of dicts in divvun-checker format, one per line:
        {"text": "line text",
         "errs": [[form, start, end, type, desc, suggs, title], ...]}
    """
    # The split step has already converted byte offsets to line-relative
    # character offsets.
    return [
        {
            "text": line_result["text"],
            "errs": [
                convert_runtime_error_to_checker_format(err)
                for err in line_result["errors"]
            ],
        }
        for line_result in split_runtime_output_by_lines(runtime_response)
    ]


def runtime_output_to_checker_json_lines(output: str) -> str:
    """Convert raw divvun-runtime output to divvun-checker JSON lines format.

    This is the main conversion function that takes raw runtime output
    and returns newline-separated JSON objects matching checker format.

    Args:
        output: Raw output from divvun-runtime command (may include ANSI
            codes)

    Returns:
        String with one JSON object per line, each line terminated by a
        newline, matching divvun-checker format. Empty string if there
        are no results.
    """
    runtime_response = parse_runtime_response(output)
    checker_results = convert_runtime_to_checker_format(runtime_response)

    json_lines = [
        json.dumps(result, ensure_ascii=False) for result in checker_results
    ]
    if not json_lines:
        return ""
    return "\n".join(json_lines) + "\n"