"""Helpers that convert uploaded DOCX and LaTeX documents to PDF bytes."""

import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional
from io import BytesIO
from html import escape as html_escape
import logging
import re

from docx import Document as DocxDocument
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.units import inch
from weasyprint import HTML, CSS
import mammoth

logger = logging.getLogger(__name__)


# Ordered (pattern, replacement, flags) rules applied by
# FileConverter._latex_to_html_advanced. Order matters: the specific commands
# must be rewritten before the two catch-all rules that delete any remaining
# command, and the whitespace rules must run last.
#
# NOTE(review): every replacement below consists only of whitespace and plain
# text (no HTML tags), and the final r'\s+' rule collapses that whitespace to
# single spaces. This looks like markup was lost somewhere upstream -- confirm
# against the original source before relying on the rendered layout.
_LATEX_RULES = (
    # Preamble and document scaffolding.
    (r'\\documentclass(?:\[[^\]]*\])?\{[^}]*\}', '', 0),
    (r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\}', '', 0),
    (r'\\begin\{document\}', '', 0),
    (r'\\end\{document\}', '', 0),
    (r'\\maketitle', '', 0),
    # Title block.
    (r'\\title\{([^}]*)\}', '\n\n\\1\n\n', 0),
    (r'\\author\{([^}]*)\}', '\n\nAutore: \\1\n\n', 0),
    (r'\\date\{([^}]*)\}', '\n\nData: \\1\n\n', 0),
    # Sectioning commands (starred or not).
    (r'\\section\*?\{([^}]*)\}', '\n\n\\1\n\n', 0),
    (r'\\subsection\*?\{([^}]*)\}', '\n\n\\1\n\n', 0),
    (r'\\subsubsection\*?\{([^}]*)\}', '\n\n\\1\n\n', 0),
    (r'\\paragraph\{([^}]*)\}', '\n\\1\n', 0),
    # Inline formatting: keep only the argument.
    (r'\\textbf\{([^}]*)\}', '\\1', 0),
    (r'\\textit\{([^}]*)\}', '\\1', 0),
    (r'\\emph\{([^}]*)\}', '\\1', 0),
    (r'\\underline\{([^}]*)\}', '\\1', 0),
    (r'\\texttt\{([^}]*)\}', '\\1', 0),
    # Math: display ($$...$$) must be handled before inline ($...$).
    (r'\$\$([^$]+)\$\$', '\n\\1\n', 0),
    (r'\$([^$]+)\$', '\\1', 0),
    # Lists and quotes.
    (r'\\begin\{itemize\}', '', 0),
    (r'\\begin\{enumerate\}', '\n    ', 0),
    (r'\\end\{enumerate\}', '\n', 0),
    (r'\\item(?:\[[^\]]*\])?\s*', '\n  • ', 0),
    (r'\\begin\{quote\}', '\n    ', 0),
    (r'\\end\{quote\}', '\n    ', 0),
    # Floats are replaced wholesale by a placeholder.
    (r'\\begin\{figure\}.*?\\end\{figure\}', '\n    [Figura]\n    ', re.DOTALL),
    (r'\\begin\{table\}.*?\\end\{table\}', '\n    [Tabella]\n    ', re.DOTALL),
    # Catch-alls: drop any command that is still present.
    (r'\\[a-zA-Z]+(?:\[[^\]]*\])?\{[^}]*\}', '', 0),
    (r'\\[a-zA-Z]+', '', 0),
    # Line breaks and whitespace normalisation.
    (r'\\\\', '\n    ', 0),
    (r'\n\s*\n', '\n\n    ', 0),
    (r'\s+', ' ', 0),
)

_LATEX_SUBSTITUTIONS = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in _LATEX_RULES
)


class FileConverter:
    """Static helpers that turn DOCX or LaTeX input into PDF bytes."""

    @staticmethod
    def get_file_extension(filename: str) -> str:
        """Return the lower-cased final suffix of *filename* (e.g. ``'.docx'``)."""
        return Path(filename).suffix.lower()

    @staticmethod
    def _pdf_filename(original_filename: str, source_extensions: Tuple[str, ...]) -> str:
        """Return *original_filename* with its trailing extension swapped for ``.pdf``.

        The extension is matched case-insensitively against
        *source_extensions* and only the final extension is replaced, so
        ``REPORT.DOCX`` becomes ``REPORT.pdf`` and ``my.docx.v2.docx`` becomes
        ``my.docx.v2.pdf``. A name whose extension is not listed is returned
        unchanged.
        """
        stem, extension = os.path.splitext(original_filename)
        if extension.lower() in source_extensions:
            return stem + '.pdf'
        return original_filename

    @staticmethod
    def convert_docx_to_pdf_reportlab(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert DOCX bytes to PDF with python-docx and ReportLab.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; used to derive the
                name of the PDF.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        If anything fails, the failure is logged and the conversion is
        retried through ``convert_docx_to_pdf_mammoth``.
        """
        try:
            # python-docx accepts a file-like object, so the input never
            # needs to touch the disk.
            doc = DocxDocument(BytesIO(docx_content))

            # _create_pdf_from_docx writes to a path, so the PDF goes through
            # a temporary directory and is read back.
            with tempfile.TemporaryDirectory() as temp_dir:
                pdf_path = os.path.join(temp_dir, "output.pdf")
                FileConverter._create_pdf_from_docx(doc, pdf_path)
                with open(pdf_path, "rb") as pdf_file:
                    pdf_content = pdf_file.read()

            return pdf_content, FileConverter._pdf_filename(original_filename, ('.docx',))
        except Exception as e:
            logger.warning(
                "reportlab error in DOCX: %s, fallback to mammoth converter", e
            )
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)

    @staticmethod
    def convert_docx_to_pdf_mammoth(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert DOCX bytes to PDF via mammoth (DOCX -> HTML) and WeasyPrint.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; used to derive the
                name of the PDF.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        Raises:
            Exception: If any step of the conversion fails; the original
                error is attached as the cause.
        """
        try:
            # mammoth reads from a binary file object; an in-memory buffer
            # avoids writing the upload to disk just to reopen it.
            result = mammoth.convert_to_html(BytesIO(docx_content))
            full_html = FileConverter._wrap_html_with_styles(result.value, "DOCX Document")
            pdf_bytes = FileConverter._html_to_pdf(full_html)
            return pdf_bytes, FileConverter._pdf_filename(original_filename, ('.docx',))
        except Exception as e:
            raise Exception(f"Impossible to convert from DOCX to PDF: {str(e)}") from e

    @staticmethod
    def _create_pdf_from_docx(docx_doc, output_path: str):
        """Render the paragraphs of a python-docx document to a PDF file.

        Args:
            docx_doc: Document object whose ``paragraphs`` are rendered.
            output_path: Path of the PDF file to write (A4 pages).

        Blank paragraphs are skipped. A paragraph uses the title style when
        it is shorter than 100 characters and all upper-case, or when its
        first run is bold; otherwise it uses the normal style.
        """
        doc = SimpleDocTemplate(output_path, pagesize=A4)
        styles = getSampleStyleSheet()

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=12,
            textColor='black',
        )
        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            textColor='black',
        )

        story = []
        for paragraph in docx_doc.paragraphs:
            text = paragraph.text
            if not text.strip():
                continue

            short_all_caps = len(text) < 100 and text.isupper()
            starts_bold = bool(paragraph.runs and paragraph.runs[0].bold)
            style = title_style if (short_all_caps or starts_bold) else normal_style

            # Paragraph parses its text as XML-like markup, so literal
            # '&', '<' and '>' from the document must be escaped.
            story.append(Paragraph(html_escape(text, quote=False), style))
            story.append(Spacer(1, 6))

        if not story:
            story.append(Paragraph("DOCX converted", normal_style))

        # Build the PDF
        doc.build(story)

    @staticmethod
    def convert_latex_to_pdf(latex_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert LaTeX source bytes to PDF via a regex-based HTML rendering.

        Args:
            latex_content: Raw bytes of the LaTeX source; decoded as UTF-8
                with undecodable bytes ignored.
            original_filename: Name of the source file; used to derive the
                name of the PDF.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        Raises:
            Exception: If any step of the conversion fails; the original
                error is attached as the cause.
        """
        try:
            latex_text = latex_content.decode('utf-8', errors='ignore')
            html_content = FileConverter._latex_to_html_advanced(latex_text)
            pdf_bytes = FileConverter._html_to_pdf(html_content)
            new_filename = FileConverter._pdf_filename(original_filename, ('.tex', '.latex'))
            return pdf_bytes, new_filename
        except Exception as e:
            raise Exception(f"Impossibile to convert from LaTeX to PDF: {str(e)}") from e

    @staticmethod
    def _latex_to_html_advanced(latex_text: str) -> str:
        """Translate LaTeX source into the page passed to the PDF renderer.

        The text is HTML-escaped first so that literal ``&``, ``<`` and ``>``
        in the LaTeX source cannot be parsed as markup, then every rule in
        ``_LATEX_SUBSTITUTIONS`` is applied in order, the result is stripped
        and wrapped by ``_wrap_html_with_styles`` with the title
        ``"LaTeX Document"``.
        """
        text = html_escape(latex_text, quote=False)
        for pattern, replacement in _LATEX_SUBSTITUTIONS:
            text = pattern.sub(replacement, text)
        return FileConverter._wrap_html_with_styles(text.strip(), "LaTeX Document")

    @staticmethod
    def _wrap_html_with_styles(content: str, title: str) -> str:
        """Insert *title* and *content* into the page template and return it.

        NOTE(review): the template as received contains no HTML tags or CSS
        despite the method name -- confirm against the original source that
        the markup was not lost.
        """
        return f" {title}\n\n    {content}\n    "

    @staticmethod
    def _html_to_pdf(html_content: str) -> bytes:
        """Render an HTML string to PDF bytes with WeasyPrint.

        Raises:
            Exception: If rendering fails; the original error is attached as
                the cause.
        """
        try:
            return HTML(string=html_content).write_pdf()
        except Exception as e:
            raise Exception(f"Impossible to convert from HTML to PDF: {str(e)}") from e

    @staticmethod
    def convert_to_pdf_if_needed(file_content: bytes, filename: str) -> Tuple[bytes, str]:
        """Return *file_content* as PDF, converting it when necessary.

        ``.pdf`` input is returned untouched; ``.docx`` goes through the
        mammoth converter; ``.tex`` / ``.latex`` go through the LaTeX
        converter. The extension check is case-insensitive.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        Raises:
            Exception: If the extension is not one of the supported formats,
                or if the underlying conversion fails.
        """
        extension = FileConverter.get_file_extension(filename)

        if extension == '.pdf':
            return file_content, filename
        if extension == '.docx':
            return FileConverter.convert_docx_to_pdf_mammoth(file_content, filename)
        if extension in ('.tex', '.latex'):
            return FileConverter.convert_latex_to_pdf(file_content, filename)

        raise Exception(f"Format not supported: {extension}")


class AdvancedDocxConverter:
    """DOCX conversion through pandoc, falling back to ``FileConverter``."""

    @staticmethod
    def convert_docx_with_pandoc(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert DOCX bytes to PDF using pypandoc for the HTML step.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; used to derive the
                name of the PDF.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        If pypandoc cannot be imported, or the pandoc-based conversion fails
        for any reason, a warning is logged and the conversion falls back to
        ``FileConverter.convert_docx_to_pdf_mammoth``.
        """
        try:
            # Imported lazily so the module works without pypandoc installed.
            import pypandoc

            with tempfile.TemporaryDirectory() as temp_dir:
                # pypandoc.convert_file is given a path, so the bytes are
                # written to a temporary file first.
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as docx_file:
                    docx_file.write(docx_content)

                html_content = pypandoc.convert_file(docx_path, 'html')
                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")
                pdf_bytes = FileConverter._html_to_pdf(full_html)

                new_filename = FileConverter._pdf_filename(original_filename, ('.docx',))
                return pdf_bytes, new_filename
        except ImportError:
            logger.warning("pypandoc not found for DOCX")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
        except Exception as e:
            logger.warning("pandoc error in DOCX: %s, fallback to standard converter", e)
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)