mirror of
https://github.com/fccapria/scientify.git
synced 2026-01-12 02:36:10 +00:00
Initial release
This commit is contained in:
commit
ae5e4b8873
52 changed files with 17572 additions and 0 deletions
362
backend/app/file_converter.py
Normal file
362
backend/app/file_converter.py
Normal file
|
|
@ -0,0 +1,362 @@
|
|||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import re
|
||||
|
||||
from docx import Document as DocxDocument
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import letter, A4
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
||||
from reportlab.lib.units import inch
|
||||
from weasyprint import HTML, CSS
|
||||
import mammoth
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileConverter:
    """Convert DOCX and LaTeX documents to PDF.

    Every method is a static method that works on in-memory ``bytes`` and
    returns ``(pdf_bytes, new_filename)``. ``convert_to_pdf_if_needed`` is the
    dispatching entry point; the other public methods are the per-format
    converters it can delegate to.
    """

    @staticmethod
    def get_file_extension(filename: str) -> str:
        """Return the lower-cased suffix of *filename*, including the dot."""
        return Path(filename).suffix.lower()

    @staticmethod
    def _pdf_filename(original_filename: str) -> str:
        """Return *original_filename* with its final extension replaced by ``.pdf``.

        Only the trailing extension is swapped, regardless of its case, so
        ``Report.DOCX`` becomes ``Report.pdf`` and ``my.text.notes.tex``
        becomes ``my.text.notes.pdf``.
        """
        return os.path.splitext(original_filename)[0] + '.pdf'

    @staticmethod
    def convert_docx_to_pdf_reportlab(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert a DOCX document to PDF using python-docx and ReportLab.

        If anything fails, the failure is logged and the conversion is retried
        with ``convert_docx_to_pdf_mammoth``.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; used to derive the
                PDF file name.

        Returns:
            A ``(pdf_bytes, new_filename)`` tuple.

        Raises:
            Exception: If the mammoth fallback fails as well.
        """
        try:
            # python-docx accepts a file-like object, so no temp file is needed to read.
            doc = DocxDocument(BytesIO(docx_content))

            with tempfile.TemporaryDirectory() as temp_dir:
                pdf_path = os.path.join(temp_dir, "output.pdf")
                FileConverter._create_pdf_from_docx(doc, pdf_path)

                with open(pdf_path, "rb") as f:
                    pdf_content = f.read()

            return pdf_content, FileConverter._pdf_filename(original_filename)

        except Exception as e:
            # Best-effort converter: record why it failed, then fall back.
            logger.warning("ReportLab DOCX conversion failed (%s), falling back to mammoth", e)
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)

    @staticmethod
    def convert_docx_to_pdf_mammoth(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert a DOCX document to PDF via mammoth (DOCX -> HTML) and WeasyPrint.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; used to derive the
                PDF file name.

        Returns:
            A ``(pdf_bytes, new_filename)`` tuple.

        Raises:
            Exception: If any step of the conversion fails; the original
                error is attached as the cause.
        """
        try:
            # mammoth reads from any binary file-like object.
            result = mammoth.convert_to_html(BytesIO(docx_content))
            full_html = FileConverter._wrap_html_with_styles(result.value, "DOCX Document")
            pdf_bytes = FileConverter._html_to_pdf(full_html)

            return pdf_bytes, FileConverter._pdf_filename(original_filename)

        except Exception as e:
            raise Exception(f"Impossible to convert from DOCX to PDF: {str(e)}") from e

    @staticmethod
    def _create_pdf_from_docx(docx_doc, output_path: str):
        """Render the paragraphs of a python-docx document into a PDF file.

        Short all-uppercase paragraphs and paragraphs whose first run is bold
        are rendered with the title style; everything else uses the normal
        style. Blank paragraphs are skipped.

        Args:
            docx_doc: A loaded python-docx document (its ``paragraphs`` are read).
            output_path: Path of the PDF file to write.
        """
        from xml.sax.saxutils import escape

        doc = SimpleDocTemplate(output_path, pagesize=A4)
        styles = getSampleStyleSheet()
        story = []

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=12,
            textColor='black'
        )

        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            textColor='black'
        )

        for paragraph in docx_doc.paragraphs:
            text = paragraph.text
            if not text.strip():
                continue

            if len(text) < 100 and text.isupper():
                style = title_style
            elif paragraph.runs and paragraph.runs[0].bold:
                style = title_style
            else:
                style = normal_style

            # Paragraph parses its text as XML-like markup, so literal
            # '<', '>' and '&' from the document must be escaped first.
            story.append(Paragraph(escape(text), style))
            story.append(Spacer(1, 6))

        if not story:
            story.append(Paragraph("DOCX converted", normal_style))

        # Build the PDF
        doc.build(story)

    @staticmethod
    def convert_latex_to_pdf(latex_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert LaTeX source to PDF via a regex-based HTML approximation.

        Args:
            latex_content: Raw bytes of the LaTeX file; decoded as UTF-8 with
                undecodable bytes dropped.
            original_filename: Name of the source file; used to derive the
                PDF file name.

        Returns:
            A ``(pdf_bytes, new_filename)`` tuple.

        Raises:
            Exception: If any step of the conversion fails; the original
                error is attached as the cause.
        """
        try:
            latex_text = latex_content.decode('utf-8', errors='ignore')
            html_content = FileConverter._latex_to_html_advanced(latex_text)
            pdf_bytes = FileConverter._html_to_pdf(html_content)

            return pdf_bytes, FileConverter._pdf_filename(original_filename)

        except Exception as e:
            raise Exception(f"Impossibile to convert from LaTeX to PDF: {str(e)}") from e

    @staticmethod
    def _latex_to_html_advanced(latex_text: str) -> str:
        """Translate a subset of LaTeX markup into a styled HTML document.

        This is a regex approximation, not a LaTeX parser: recognised commands
        are mapped to HTML tags, figures and tables become placeholders, and
        any remaining commands are dropped. Arguments containing nested braces
        are not handled.

        Args:
            latex_text: The LaTeX source.

        Returns:
            A complete HTML document produced by ``_wrap_html_with_styles``.
        """
        from html import escape

        # Escape HTML metacharacters in the source before any tags are
        # generated, so text such as "a < b" or "R&D" stays literal.
        text = escape(latex_text, quote=False)

        # Preamble and document scaffolding
        text = re.sub(r'\\documentclass(?:\[[^\]]*\])?\{[^}]*\}', '', text)
        text = re.sub(r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\}', '', text)
        text = re.sub(r'\\begin\{document\}', '', text)
        text = re.sub(r'\\end\{document\}', '', text)
        text = re.sub(r'\\maketitle', '', text)

        # Line breaks. Done before any command matching: otherwise the second
        # backslash of "\\" followed by letters is consumed as a command name.
        text = re.sub(r'\\\\', '<br>', text)

        # Title block
        text = re.sub(r'\\title\{([^}]*)\}', r'<h1 class="title">\1</h1>', text)
        text = re.sub(r'\\author\{([^}]*)\}', r'<p class="author"><strong>Autore:</strong> \1</p>', text)
        text = re.sub(r'\\date\{([^}]*)\}', r'<p class="date"><strong>Data:</strong> \1</p>', text)

        # Sectioning
        text = re.sub(r'\\section\*?\{([^}]*)\}', r'<h2>\1</h2>', text)
        text = re.sub(r'\\subsection\*?\{([^}]*)\}', r'<h3>\1</h3>', text)
        text = re.sub(r'\\subsubsection\*?\{([^}]*)\}', r'<h4>\1</h4>', text)
        text = re.sub(r'\\paragraph\{([^}]*)\}', r'<h5>\1</h5>', text)

        # Inline formatting
        text = re.sub(r'\\textbf\{([^}]*)\}', r'<strong>\1</strong>', text)
        text = re.sub(r'\\textit\{([^}]*)\}', r'<em>\1</em>', text)
        text = re.sub(r'\\emph\{([^}]*)\}', r'<em>\1</em>', text)
        text = re.sub(r'\\underline\{([^}]*)\}', r'<u>\1</u>', text)
        text = re.sub(r'\\texttt\{([^}]*)\}', r'<code>\1</code>', text)

        # Math: display ($$...$$) must be matched before inline ($...$)
        text = re.sub(r'\$\$([^$]+)\$\$', r'<div class="math-block">\1</div>', text)
        text = re.sub(r'\$([^$]+)\$', r'<span class="math-inline">\1</span>', text)

        # Lists
        text = re.sub(r'\\begin\{itemize\}', '<ul>', text)
        text = re.sub(r'\\end\{itemize\}', '</ul>', text)
        text = re.sub(r'\\begin\{enumerate\}', '<ol>', text)
        text = re.sub(r'\\end\{enumerate\}', '</ol>', text)
        text = re.sub(r'\\item(?:\[[^\]]*\])?\s*', '<li>', text)

        # Quotes
        text = re.sub(r'\\begin\{quote\}', '<blockquote>', text)
        text = re.sub(r'\\end\{quote\}', '</blockquote>', text)

        # Figures and tables are replaced by placeholders
        text = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '<div class="figure">[Figura]</div>', text,
                      flags=re.DOTALL)
        text = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '<div class="table">[Tabella]</div>', text,
                      flags=re.DOTALL)

        # Drop any remaining commands, with and then without arguments
        text = re.sub(r'\\[a-zA-Z]+(?:\[[^\]]*\])?\{[^}]*\}', '', text)
        text = re.sub(r'\\[a-zA-Z]+', '', text)

        # Blank lines separate paragraphs
        text = re.sub(r'\n\s*\n', '</p><p>', text)

        # Collapse remaining whitespace
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return FileConverter._wrap_html_with_styles(text, "LaTeX Document")

    @staticmethod
    def _wrap_html_with_styles(content: str, title: str) -> str:
        """Embed an HTML fragment in a full document with the print stylesheet.

        Args:
            content: HTML fragment placed inside the body; inserted verbatim.
            title: Text for the ``<title>`` element; inserted verbatim.

        Returns:
            The complete HTML document as a string.
        """
        html_template = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
            <style>
                @page {{
                    size: A4;
                    margin: 2cm;
                }}
                body {{
                    font-family: 'Times New Roman', serif;
                    font-size: 12pt;
                    line-height: 1.6;
                    text-align: justify;
                    color: #000;
                }}
                .title {{
                    font-size: 20pt;
                    font-weight: bold;
                    text-align: center;
                    margin-bottom: 16pt;
                }}
                .author, .date {{
                    text-align: center;
                    margin-bottom: 12pt;
                    font-style: italic;
                }}
                h1, h2 {{
                    font-size: 16pt;
                    font-weight: bold;
                    margin-top: 20pt;
                    margin-bottom: 12pt;
                }}
                h3 {{
                    font-size: 14pt;
                    font-weight: bold;
                    margin-top: 16pt;
                    margin-bottom: 10pt;
                }}
                h4, h5 {{
                    font-size: 12pt;
                    font-weight: bold;
                    margin-top: 12pt;
                    margin-bottom: 8pt;
                }}
                p {{
                    margin-bottom: 12pt;
                    text-indent: 0;
                }}
                ul, ol {{
                    margin-bottom: 12pt;
                    padding-left: 30pt;
                }}
                li {{
                    margin-bottom: 6pt;
                }}
                blockquote {{
                    margin: 12pt 20pt;
                    padding: 8pt;
                    border-left: 3pt solid #ccc;
                    font-style: italic;
                }}
                code {{
                    font-family: 'Courier New', monospace;
                    background-color: #f5f5f5;
                    padding: 2pt;
                }}
                .math-block {{
                    text-align: center;
                    margin: 12pt 0;
                    font-family: 'Times New Roman', serif;
                }}
                .math-inline {{
                    font-family: 'Times New Roman', serif;
                }}
                .figure, .table {{
                    text-align: center;
                    margin: 20pt 0;
                    padding: 10pt;
                    border: 1pt solid #ccc;
                    background-color: #f9f9f9;
                }}
                strong {{ font-weight: bold; }}
                em {{ font-style: italic; }}
                u {{ text-decoration: underline; }}
            </style>
        </head>
        <body>
            <div>{content}</div>
        </body>
        </html>
        """

        return html_template

    @staticmethod
    def _html_to_pdf(html_content: str) -> bytes:
        """Render an HTML document to PDF bytes with WeasyPrint.

        Args:
            html_content: The complete HTML document.

        Returns:
            The rendered PDF as bytes.

        Raises:
            Exception: If rendering fails; the original error is attached as
                the cause.
        """
        try:
            html_doc = HTML(string=html_content)
            pdf_bytes = html_doc.write_pdf()

            return pdf_bytes

        except Exception as e:
            raise Exception(f"Impossible to convert from HTML to PDF: {str(e)}") from e

    @staticmethod
    def convert_to_pdf_if_needed(file_content: bytes, filename: str) -> Tuple[bytes, str]:
        """Return *file_content* as PDF, converting it when it is not one already.

        The format is chosen from the (case-insensitive) file extension:
        ``.pdf`` is returned unchanged, ``.docx`` goes through the mammoth
        converter, and ``.tex`` / ``.latex`` go through the LaTeX converter.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Name of the uploaded file.

        Returns:
            A ``(pdf_bytes, pdf_filename)`` tuple.

        Raises:
            Exception: If the extension is not supported or the conversion fails.
        """
        extension = FileConverter.get_file_extension(filename)

        if extension == '.pdf':
            return file_content, filename
        elif extension == '.docx':
            return FileConverter.convert_docx_to_pdf_mammoth(file_content, filename)
        elif extension in ['.tex', '.latex']:
            return FileConverter.convert_latex_to_pdf(file_content, filename)
        else:
            raise Exception(f"Format not supported: {extension}")
|
||||
|
||||
|
||||
class AdvancedDocxConverter:
    """DOCX-to-PDF conversion that tries pandoc first and falls back to mammoth."""

    @staticmethod
    def convert_docx_with_pandoc(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        """Convert a DOCX document to PDF via pypandoc (DOCX -> HTML) and WeasyPrint.

        If ``pypandoc`` cannot be imported, or the pandoc-based conversion
        fails for any reason, a warning is logged and the work is delegated to
        ``FileConverter.convert_docx_to_pdf_mammoth``.

        Args:
            docx_content: Raw bytes of the DOCX file.
            original_filename: Name of the source file; its final extension is
                replaced by ``.pdf`` to form the returned name.

        Returns:
            A ``(pdf_bytes, new_filename)`` tuple.

        Raises:
            Exception: If the mammoth fallback fails as well.
        """
        # Optional dependency: imported lazily so the module loads without it.
        try:
            import pypandoc
        except ImportError:
            logger.warning("pypandoc not found for DOCX")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # pypandoc.convert_file is given a path, so the bytes are written to disk first.
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                html_content = pypandoc.convert_file(docx_path, 'html')

            full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")
            pdf_bytes = FileConverter._html_to_pdf(full_html)

            # Swap only the trailing extension, whatever its case.
            new_filename = os.path.splitext(original_filename)[0] + '.pdf'

            return pdf_bytes, new_filename

        except Exception as e:
            logger.warning("pandoc error in DOCX: %s, fallback to standard converter", e)
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
|
||||
Loading…
Add table
Add a link
Reference in a new issue