Initial release

2026-01-12 02:36:10 +00:00 · 2025-06-21 18:15:33 +02:00 · 2025-06-21 18:15:33 +02:00 · ae5e4b8873
commit ae5e4b8873
52 changed files with 17572 additions and 0 deletions
--- a/backend/app/file_converter.py
+++ b/backend/app/file_converter.py
@ -0,0 +1,362 @@
+import os
+import tempfile
+from pathlib import Path
+from typing import Tuple, Optional
+from io import BytesIO
+import logging
+import re
+
+from docx import Document as DocxDocument
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
+from reportlab.lib.units import inch
+from weasyprint import HTML, CSS
+import mammoth
+
+logger = logging.getLogger(__name__)
+
+
+class FileConverter:
+    @staticmethod
+    def get_file_extension(filename: str) -> str:
+        return Path(filename).suffix.lower()
+
+    @staticmethod
+    def convert_docx_to_pdf_reportlab(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Saves docx
+                docx_path = os.path.join(temp_dir, "temp.docx")
+                with open(docx_path, "wb") as f:
+                    f.write(docx_content)
+
+                # Reads docx
+                doc = DocxDocument(docx_path)
+
+                # Creates pdf
+                pdf_path = os.path.join(temp_dir, "output.pdf")
+                FileConverter._create_pdf_from_docx(doc, pdf_path)
+
+                # Reads pdf
+                with open(pdf_path, "rb") as f:
+                    pdf_content = f.read()
+
+                # Creates filename
+                new_filename = original_filename.replace('.docx', '.pdf')
+
+                return pdf_content, new_filename
+
+        except Exception as e:
+            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
+
+    @staticmethod
+    def convert_docx_to_pdf_mammoth(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Saves docx
+                docx_path = os.path.join(temp_dir, "temp.docx")
+                with open(docx_path, "wb") as f:
+                    f.write(docx_content)
+
+                # Converts in HTML
+                with open(docx_path, "rb") as docx_file:
+                    result = mammoth.convert_to_html(docx_file)
+                    html_content = result.value
+
+                # Creates HTML
+                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")
+
+                # Converts to PDF
+                pdf_bytes = FileConverter._html_to_pdf(full_html)
+
+                new_filename = original_filename.replace('.docx', '.pdf')
+
+                return pdf_bytes, new_filename
+
+        except Exception as e:
+            raise Exception(f"Impossible to convert from DOCX to PDF: {str(e)}")
+
+    @staticmethod
+    def _create_pdf_from_docx(docx_doc, output_path: str):
+        doc = SimpleDocTemplate(output_path, pagesize=A4)
+        styles = getSampleStyleSheet()
+        story = []
+
+        # Custom styles
+        title_style = ParagraphStyle(
+            'CustomTitle',
+            parent=styles['Heading1'],
+            fontSize=16,
+            spaceAfter=12,
+            textColor='black'
+        )
+
+        normal_style = ParagraphStyle(
+            'CustomNormal',
+            parent=styles['Normal'],
+            fontSize=11,
+            spaceAfter=6,
+            textColor='black'
+        )
+
+        for paragraph in docx_doc.paragraphs:
+            if paragraph.text.strip():
+                if len(paragraph.text) < 100 and paragraph.text.isupper():
+                    style = title_style
+                elif paragraph.runs and paragraph.runs[0].bold:
+                    style = title_style
+                else:
+                    style = normal_style
+
+                p = Paragraph(paragraph.text, style)
+                story.append(p)
+                story.append(Spacer(1, 6))
+
+        if not story:
+            story.append(Paragraph("DOCX converted", normal_style))
+
+        # Costruisci il PDF
+        doc.build(story)
+
+    @staticmethod
+    def convert_latex_to_pdf(latex_content: bytes, original_filename: str) -> Tuple[bytes, str]:
+        try:
+            # Decodes LaTeX
+            latex_text = latex_content.decode('utf-8', errors='ignore')
+
+            # Converts to HTML
+            html_content = FileConverter._latex_to_html_advanced(latex_text)
+
+            # Converts to PDF
+            pdf_bytes = FileConverter._html_to_pdf(html_content)
+
+            # Creates filename
+            new_filename = original_filename.replace('.tex', '.pdf').replace('.latex', '.pdf')
+
+            return pdf_bytes, new_filename
+
+        except Exception as e:
+            raise Exception(f"Impossibile to convert from LaTeX to PDF: {str(e)}")
+
+    @staticmethod
+    def _latex_to_html_advanced(latex_text: str) -> str:
+        html = latex_text
+
+        html = re.sub(r'\\documentclass(?:\[[^\]]*\])?\{[^}]*\}', '', html)
+        html = re.sub(r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\}', '', html)
+        html = re.sub(r'\\begin\{document\}', '', html)
+        html = re.sub(r'\\end\{document\}', '', html)
+        html = re.sub(r'\\maketitle', '', html)
+
+        html = re.sub(r'\\title\{([^}]*)\}', r'<h1 class="title">\1</h1>', html)
+        html = re.sub(r'\\author\{([^}]*)\}', r'<p class="author"><strong>Autore:</strong> \1</p>', html)
+        html = re.sub(r'\\date\{([^}]*)\}', r'<p class="date"><strong>Data:</strong> \1</p>', html)
+
+ 
+        html = re.sub(r'\\section\*?\{([^}]*)\}', r'<h2>\1</h2>', html)
+        html = re.sub(r'\\subsection\*?\{([^}]*)\}', r'<h3>\1</h3>', html)
+        html = re.sub(r'\\subsubsection\*?\{([^}]*)\}', r'<h4>\1</h4>', html)
+        html = re.sub(r'\\paragraph\{([^}]*)\}', r'<h5>\1</h5>', html)
+
+
+        html = re.sub(r'\\textbf\{([^}]*)\}', r'<strong>\1</strong>', html)
+        html = re.sub(r'\\textit\{([^}]*)\}', r'<em>\1</em>', html)
+        html = re.sub(r'\\emph\{([^}]*)\}', r'<em>\1</em>', html)
+        html = re.sub(r'\\underline\{([^}]*)\}', r'<u>\1</u>', html)
+        html = re.sub(r'\\texttt\{([^}]*)\}', r'<code>\1</code>', html)
+
+        
+        html = re.sub(r'\$\$([^$]+)\$\$', r'<div class="math-block">\1</div>', html)
+        html = re.sub(r'\$([^$]+)\$', r'<span class="math-inline">\1</span>', html)
+
+        
+        html = re.sub(r'\\begin\{itemize\}', '<ul>', html)
+        html = re.sub(r'\\end\{itemize\}', '</ul>', html)
+        html = re.sub(r'\\begin\{enumerate\}', '<ol>', html)
+        html = re.sub(r'\\end\{enumerate\}', '</ol>', html)
+        html = re.sub(r'\\item(?:\[[^\]]*\])?\s*', '<li>', html)
+
+        
+        html = re.sub(r'\\begin\{quote\}', '<blockquote>', html)
+        html = re.sub(r'\\end\{quote\}', '</blockquote>', html)
+
+        
+        html = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '<div class="figure">[Figura]</div>', html,
+                      flags=re.DOTALL)
+
+        
+        html = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '<div class="table">[Tabella]</div>', html, flags=re.DOTALL)
+
+        
+        html = re.sub(r'\\[a-zA-Z]+(?:\[[^\]]*\])?\{[^}]*\}', '', html)
+        html = re.sub(r'\\[a-zA-Z]+', '', html)
+
+        
+        html = re.sub(r'\\\\', '<br>', html)
+
+        
+        html = re.sub(r'\n\s*\n', '</p><p>', html)
+
+        
+        html = re.sub(r'\s+', ' ', html)
+        html = html.strip()
+
+        return FileConverter._wrap_html_with_styles(html, "LaTeX Document")
+
+    @staticmethod
+    def _wrap_html_with_styles(content: str, title: str) -> str:
+        html_template = f"""
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <meta charset="utf-8">
+            <title>{title}</title>
+            <style>
+                @page {{
+                    size: A4;
+                    margin: 2cm;
+                }}
+                body {{
+                    font-family: 'Times New Roman', serif;
+                    font-size: 12pt;
+                    line-height: 1.6;
+                    text-align: justify;
+                    color: #000;
+                }}
+                .title {{
+                    font-size: 20pt;
+                    font-weight: bold;
+                    text-align: center;
+                    margin-bottom: 16pt;
+                }}
+                .author, .date {{
+                    text-align: center;
+                    margin-bottom: 12pt;
+                    font-style: italic;
+                }}
+                h1, h2 {{ 
+                    font-size: 16pt; 
+                    font-weight: bold;
+                    margin-top: 20pt; 
+                    margin-bottom: 12pt;
+                }}
+                h3 {{ 
+                    font-size: 14pt; 
+                    font-weight: bold;
+                    margin-top: 16pt; 
+                    margin-bottom: 10pt;
+                }}
+                h4, h5 {{ 
+                    font-size: 12pt; 
+                    font-weight: bold;
+                    margin-top: 12pt; 
+                    margin-bottom: 8pt;
+                }}
+                p {{ 
+                    margin-bottom: 12pt; 
+                    text-indent: 0;
+                }}
+                ul, ol {{ 
+                    margin-bottom: 12pt; 
+                    padding-left: 30pt;
+                }}
+                li {{ 
+                    margin-bottom: 6pt; 
+                }}
+                blockquote {{
+                    margin: 12pt 20pt;
+                    padding: 8pt;
+                    border-left: 3pt solid #ccc;
+                    font-style: italic;
+                }}
+                code {{
+                    font-family: 'Courier New', monospace;
+                    background-color: #f5f5f5;
+                    padding: 2pt;
+                }}
+                .math-block {{
+                    text-align: center;
+                    margin: 12pt 0;
+                    font-family: 'Times New Roman', serif;
+                }}
+                .math-inline {{
+                    font-family: 'Times New Roman', serif;
+                }}
+                .figure, .table {{
+                    text-align: center;
+                    margin: 20pt 0;
+                    padding: 10pt;
+                    border: 1pt solid #ccc;
+                    background-color: #f9f9f9;
+                }}
+                strong {{ font-weight: bold; }}
+                em {{ font-style: italic; }}
+                u {{ text-decoration: underline; }}
+            </style>
+        </head>
+        <body>
+            <div>{content}</div>
+        </body>
+        </html>
+        """
+
+        return html_template
+
+    @staticmethod
+    def _html_to_pdf(html_content: str) -> bytes:
+        try:
+            # Creates PDF
+            html_doc = HTML(string=html_content)
+            pdf_bytes = html_doc.write_pdf()
+
+            return pdf_bytes
+
+        except Exception as e:
+            raise Exception(f"Impossible to convert from HTML to PDF: {str(e)}")
+
+    @staticmethod
+    def convert_to_pdf_if_needed(file_content: bytes, filename: str) -> Tuple[bytes, str]:
+        extension = FileConverter.get_file_extension(filename)
+
+        if extension == '.pdf':
+            return file_content, filename
+        elif extension == '.docx':
+            return FileConverter.convert_docx_to_pdf_mammoth(file_content, filename)
+        elif extension in ['.tex', '.latex']:
+            return FileConverter.convert_latex_to_pdf(file_content, filename)
+        else:
+            raise Exception(f"Format not supported: {extension}")
+
+
+class AdvancedDocxConverter:
+    @staticmethod
+    def convert_docx_with_pandoc(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
+        try:
+            import pypandoc
+
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Saves DOCX
+                docx_path = os.path.join(temp_dir, "temp.docx")
+                with open(docx_path, "wb") as f:
+                    f.write(docx_content)
+
+                # Converts to HTML
+                html_content = pypandoc.convert_file(docx_path, 'html')
+
+                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")
+
+                # Converts HTML to PDF
+                pdf_bytes = FileConverter._html_to_pdf(full_html)
+
+                new_filename = original_filename.replace('.docx', '.pdf')
+
+                return pdf_bytes, new_filename
+
+        except ImportError:
+            logger.warning("pypandoc not found for DOCX")
+            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
+        except Exception as e:
+            logger.warning(f"pandoc error in DOCX: {e}, fallback to standard converter")
+            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)