Mirror of https://github.com/fccapria/scientify.git (synced 2026-01-12 02:36:10 +00:00)

Initial release
Commit ae5e4b8873
52 changed files with 17572 additions and 0 deletions
8  backend/.env  Normal file
@@ -0,0 +1,8 @@
POSTGRES_USER=scientify_user
POSTGRES_PASSWORD=scientify_pass
POSTGRES_DB=scientify_db
POSTGRES_HOST=db
POSTGRES_PORT=5432

# Database URL for local development with Docker
DATABASE_URL=postgresql+asyncpg://scientify_user:scientify_pass@db:5432/scientify_db
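For local runs outside Docker, a minimal sketch of how these variables could be loaded (python-dotenv is already in requirements.txt; the .env path and working directory are assumptions):

# sketch: load backend/.env and fall back to composing the async URL by hand
import os
from dotenv import load_dotenv

load_dotenv(".env")  # assumes the working directory is backend/
db_url = os.getenv(
    "DATABASE_URL",
    f"postgresql+asyncpg://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}"
    f"@{os.getenv('POSTGRES_HOST')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}",
)
print(db_url)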
25  backend/Dockerfile  Normal file
@@ -0,0 +1,25 @@
FROM python:3.11-slim

RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    libpq-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    libcairo2-dev \
    libpango1.0-dev \
    libgdk-pixbuf2.0-dev \
    libgtk-3-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8000

CMD ["python", "main.py"]
82  backend/app/app.py  Normal file
@@ -0,0 +1,82 @@
from contextlib import asynccontextmanager

from fastapi import Depends, FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.db import User, create_db_and_tables
from app.schemas import UserCreate, UserRead, UserUpdate
from app.users import auth_backend, current_active_user, fastapi_users
from app.upload import router as upload_router
from app.download import router as download_router
from app.publication_routes import router as publication_router
from app.debug_routes import router as debug_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    await create_db_and_tables()
    yield


app = FastAPI(
    title="Scientify API",
    description="API for managing scientific publications",
    version="1.0.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://frontend:80"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers for different parts of the application
app.include_router(upload_router)
app.include_router(download_router)
app.include_router(publication_router)
app.include_router(debug_router)

# Authentication and user management routes
app.include_router(
    fastapi_users.get_auth_router(auth_backend), prefix="/auth/jwt", tags=["auth"]
)
app.include_router(
    fastapi_users.get_register_router(UserRead, UserCreate),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_reset_password_router(),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_verify_router(UserRead),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_users_router(UserRead, UserUpdate),
    prefix="/users",
    tags=["users"],
)


@app.get("/authenticated-route")
async def authenticated_route(user: User = Depends(current_active_user)):
    return {"message": f"Hello {user.email}!"}


@app.get("/")
async def root():
    """
    Root endpoint for the Scientify API
    """
    return {
        "message": "Welcome to Scientify API",
        "description": "The intelligent platform to manage your scientific publications",
        "documentation": "/docs"
    }
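A minimal sketch of exercising these routes with httpx. The base URL and credentials are assumptions; fastapi-users' /auth/jwt/login expects form-encoded username/password and returns a bearer token:

# sketch: register, log in, and call the protected route
import httpx

BASE = "http://localhost:8000"  # assumption: uvicorn default port

with httpx.Client(base_url=BASE) as client:
    client.post("/auth/register", json={"email": "me@example.org", "password": "s3cret"})
    token = client.post(
        "/auth/jwt/login",
        data={"username": "me@example.org", "password": "s3cret"},
    ).json()["access_token"]
    r = client.get("/authenticated-route", headers={"Authorization": f"Bearer {token}"})
    print(r.json())  # expected: {"message": "Hello me@example.org!"}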
100  backend/app/db.py  Normal file
@@ -0,0 +1,100 @@
import os
from collections.abc import AsyncGenerator

from fastapi import Depends
from fastapi_users.db import SQLAlchemyBaseUserTableUUID, SQLAlchemyUserDatabase

from sqlalchemy import Column, Integer, String, Table, ForeignKey, LargeBinary, DateTime
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, relationship, sessionmaker
from sqlalchemy.dialects.postgresql import UUID
import uuid
import datetime

DATABASE_URL = os.getenv("DATABASE_URL", "postgresql+asyncpg://scientify_user:scientify_pass@db:5432/scientify_db")

if "+asyncpg" not in DATABASE_URL:
    raise ValueError("DATABASE_URL must use the asyncpg driver for async operations. Use postgresql+asyncpg://...")


class Base(DeclarativeBase):
    pass


class User(SQLAlchemyBaseUserTableUUID, Base):
    first_name = Column(String, nullable=True)
    last_name = Column(String, nullable=True)

    publications = relationship("Publication", back_populates="user")


try:
    engine = create_async_engine(DATABASE_URL, echo=True)
    print("Database engine created successfully")
except Exception as e:
    print(f"Error creating database engine: {e}")
    raise

async_session_maker = sessionmaker(
    engine, class_=AsyncSession, expire_on_commit=False
)


async def get_db():
    async with async_session_maker() as session:
        yield session


async def create_db_and_tables():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)


async def get_async_session():
    async with async_session_maker() as session:
        yield session


async def get_user_db(session: AsyncSession = Depends(get_async_session)):
    yield SQLAlchemyUserDatabase(session, User)


# Association tables for the many-to-many relations
publication_authors = Table(
    'publication_authors', Base.metadata,
    Column('publication_id', Integer, ForeignKey('publications.id', ondelete='CASCADE')),
    Column('author_id', Integer, ForeignKey('authors.id', ondelete='CASCADE'))
)

publication_keywords = Table(
    'publication_keywords', Base.metadata,
    Column('publication_id', Integer, ForeignKey('publications.id', ondelete='CASCADE')),
    Column('keyword_id', Integer, ForeignKey('keywords.id', ondelete='CASCADE'))
)


class Author(Base):
    __tablename__ = 'authors'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)


class Keyword(Base):
    __tablename__ = 'keywords'
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)


class Publication(Base):
    __tablename__ = 'publications'
    id = Column(Integer, primary_key=True)
    title = Column(String, nullable=False)
    file = Column(LargeBinary, nullable=False)
    filename = Column(String)
    upload_date = Column(DateTime, default=datetime.datetime.utcnow)
    journal = Column(String, nullable=True)
    year = Column(Integer, nullable=True)
    doi = Column(String, nullable=True, unique=True)

    user_id = Column(UUID(as_uuid=True), ForeignKey('user.id'), nullable=False)
    user = relationship("User", back_populates="publications")
    authors = relationship('Author', secondary=publication_authors, backref='publications')
    keywords = relationship('Keyword', secondary=publication_keywords, backref='publications')
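A minimal sketch of using these models from a standalone script, assuming the Postgres instance from docker-compose.yml is reachable with the configured DATABASE_URL:

# sketch: query publications with their authors and keywords eagerly loaded
import asyncio
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from app.db import Publication, async_session_maker

async def list_titles():
    async with async_session_maker() as session:
        stmt = select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords),
        )
        for pub in (await session.execute(stmt)).scalars():
            print(pub.title, [a.name for a in pub.authors])

asyncio.run(list_titles())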
59  backend/app/debug_routes.py  Normal file
@@ -0,0 +1,59 @@
from fastapi import Depends, APIRouter
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Publication, get_db, Keyword, Author

router = APIRouter(prefix="/debug", tags=["debug"])


# Debug endpoint to view all publications with complete data
@router.get("/publications")
async def debug_publications(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all publications with their complete data"""
    stmt = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords),
        selectinload(Publication.user)
    ).order_by(Publication.upload_date.desc())

    result = await db.execute(stmt)
    publications = result.scalars().all()

    debug_data = []
    for pub in publications:
        debug_data.append({
            "id": pub.id,
            "title": pub.title,
            "authors": [{"id": a.id, "name": a.name} for a in pub.authors],
            "keywords": [{"id": k.id, "name": k.name} for k in pub.keywords],
            "upload_date": pub.upload_date,
            "journal": pub.journal,
            "year": pub.year,
            "doi": pub.doi,
            "user_email": pub.user.email if pub.user else None,
            "user_id": str(pub.user_id) if pub.user_id else None
        })

    return {
        "total_publications": len(publications),
        "publications": debug_data
    }


# Debug endpoint to view all authors
@router.get("/authors")
async def debug_authors(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all authors"""
    result = await db.execute(select(Author))
    authors = result.scalars().all()
    return [{"id": a.id, "name": a.name} for a in authors]


# Debug endpoint to view all keywords
@router.get("/keywords")
async def debug_keywords(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all keywords"""
    result = await db.execute(select(Keyword))
    keywords = result.scalars().all()
    return [{"id": k.id, "name": k.name} for k in keywords]
26  backend/app/download.py  Normal file
@@ -0,0 +1,26 @@
from fastapi import APIRouter, HTTPException, Depends
from fastapi.responses import StreamingResponse

from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.db import Publication, get_db

import io

router = APIRouter()

@router.get("/download/{publication_id}")
async def download_publication(publication_id: int, db: AsyncSession = Depends(get_db)):
    result = await db.execute(select(Publication).where(Publication.id == publication_id))
    publication = result.scalar_one_or_none()
    if not publication:
        raise HTTPException(status_code=404, detail="Publication not found")
    file_bytes = publication.file
    filename = publication.filename or "document.pdf"

    return StreamingResponse(
        io.BytesIO(file_bytes),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename={filename}"}
    )
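A minimal client-side sketch of saving a stored PDF to disk; the base URL and publication id are assumptions:

# sketch: download publication 1 and write it to a local file
import httpx

resp = httpx.get("http://localhost:8000/download/1")
resp.raise_for_status()
with open("publication_1.pdf", "wb") as fh:
    fh.write(resp.content)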
362  backend/app/file_converter.py  Normal file
@@ -0,0 +1,362 @@
import os
import tempfile
from pathlib import Path
from typing import Tuple, Optional
from io import BytesIO
import logging
import re

from docx import Document as DocxDocument
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.units import inch
from weasyprint import HTML, CSS
import mammoth

logger = logging.getLogger(__name__)


class FileConverter:
    @staticmethod
    def get_file_extension(filename: str) -> str:
        return Path(filename).suffix.lower()

    @staticmethod
    def convert_docx_to_pdf_reportlab(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to a temporary file
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Read the DOCX
                doc = DocxDocument(docx_path)

                # Create the PDF
                pdf_path = os.path.join(temp_dir, "output.pdf")
                FileConverter._create_pdf_from_docx(doc, pdf_path)

                # Read the PDF back
                with open(pdf_path, "rb") as f:
                    pdf_content = f.read()

                # Build the new filename
                new_filename = original_filename.replace('.docx', '.pdf')

                return pdf_content, new_filename

        except Exception as e:
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)

    @staticmethod
    def convert_docx_to_pdf_mammoth(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to a temporary file
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Convert to HTML
                with open(docx_path, "rb") as docx_file:
                    result = mammoth.convert_to_html(docx_file)
                    html_content = result.value

                # Wrap the HTML with print styles
                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")

                # Convert to PDF
                pdf_bytes = FileConverter._html_to_pdf(full_html)

                new_filename = original_filename.replace('.docx', '.pdf')

                return pdf_bytes, new_filename

        except Exception as e:
            raise Exception(f"Impossible to convert from DOCX to PDF: {str(e)}")

    @staticmethod
    def _create_pdf_from_docx(docx_doc, output_path: str):
        doc = SimpleDocTemplate(output_path, pagesize=A4)
        styles = getSampleStyleSheet()
        story = []

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=12,
            textColor='black'
        )

        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            textColor='black'
        )

        for paragraph in docx_doc.paragraphs:
            if paragraph.text.strip():
                if len(paragraph.text) < 100 and paragraph.text.isupper():
                    style = title_style
                elif paragraph.runs and paragraph.runs[0].bold:
                    style = title_style
                else:
                    style = normal_style

                p = Paragraph(paragraph.text, style)
                story.append(p)
                story.append(Spacer(1, 6))

        if not story:
            story.append(Paragraph("DOCX converted", normal_style))

        # Build the PDF
        doc.build(story)

    @staticmethod
    def convert_latex_to_pdf(latex_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            # Decode the LaTeX source
            latex_text = latex_content.decode('utf-8', errors='ignore')

            # Convert to HTML
            html_content = FileConverter._latex_to_html_advanced(latex_text)

            # Convert to PDF
            pdf_bytes = FileConverter._html_to_pdf(html_content)

            # Build the new filename
            new_filename = original_filename.replace('.tex', '.pdf').replace('.latex', '.pdf')

            return pdf_bytes, new_filename

        except Exception as e:
            raise Exception(f"Impossible to convert from LaTeX to PDF: {str(e)}")

    @staticmethod
    def _latex_to_html_advanced(latex_text: str) -> str:
        html = latex_text

        # Strip preamble and document environment
        html = re.sub(r'\\documentclass(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\begin\{document\}', '', html)
        html = re.sub(r'\\end\{document\}', '', html)
        html = re.sub(r'\\maketitle', '', html)

        # Title block
        html = re.sub(r'\\title\{([^}]*)\}', r'<h1 class="title">\1</h1>', html)
        html = re.sub(r'\\author\{([^}]*)\}', r'<p class="author"><strong>Author:</strong> \1</p>', html)
        html = re.sub(r'\\date\{([^}]*)\}', r'<p class="date"><strong>Date:</strong> \1</p>', html)

        # Sectioning
        html = re.sub(r'\\section\*?\{([^}]*)\}', r'<h2>\1</h2>', html)
        html = re.sub(r'\\subsection\*?\{([^}]*)\}', r'<h3>\1</h3>', html)
        html = re.sub(r'\\subsubsection\*?\{([^}]*)\}', r'<h4>\1</h4>', html)
        html = re.sub(r'\\paragraph\{([^}]*)\}', r'<h5>\1</h5>', html)

        # Inline formatting
        html = re.sub(r'\\textbf\{([^}]*)\}', r'<strong>\1</strong>', html)
        html = re.sub(r'\\textit\{([^}]*)\}', r'<em>\1</em>', html)
        html = re.sub(r'\\emph\{([^}]*)\}', r'<em>\1</em>', html)
        html = re.sub(r'\\underline\{([^}]*)\}', r'<u>\1</u>', html)
        html = re.sub(r'\\texttt\{([^}]*)\}', r'<code>\1</code>', html)

        # Math
        html = re.sub(r'\$\$([^$]+)\$\$', r'<div class="math-block">\1</div>', html)
        html = re.sub(r'\$([^$]+)\$', r'<span class="math-inline">\1</span>', html)

        # Lists
        html = re.sub(r'\\begin\{itemize\}', '<ul>', html)
        html = re.sub(r'\\end\{itemize\}', '</ul>', html)
        html = re.sub(r'\\begin\{enumerate\}', '<ol>', html)
        html = re.sub(r'\\end\{enumerate\}', '</ol>', html)
        html = re.sub(r'\\item(?:\[[^\]]*\])?\s*', '<li>', html)

        # Quotes
        html = re.sub(r'\\begin\{quote\}', '<blockquote>', html)
        html = re.sub(r'\\end\{quote\}', '</blockquote>', html)

        # Figures and tables become placeholders
        html = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '<div class="figure">[Figure]</div>', html,
                      flags=re.DOTALL)

        html = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '<div class="table">[Table]</div>', html, flags=re.DOTALL)

        # Drop any remaining LaTeX commands
        html = re.sub(r'\\[a-zA-Z]+(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\[a-zA-Z]+', '', html)

        # Line breaks
        html = re.sub(r'\\\\', '<br>', html)

        # Blank lines become paragraph breaks
        html = re.sub(r'\n\s*\n', '</p><p>', html)

        # Collapse whitespace
        html = re.sub(r'\s+', ' ', html)
        html = html.strip()

        return FileConverter._wrap_html_with_styles(html, "LaTeX Document")

    @staticmethod
    def _wrap_html_with_styles(content: str, title: str) -> str:
        html_template = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
            <style>
                @page {{
                    size: A4;
                    margin: 2cm;
                }}
                body {{
                    font-family: 'Times New Roman', serif;
                    font-size: 12pt;
                    line-height: 1.6;
                    text-align: justify;
                    color: #000;
                }}
                .title {{
                    font-size: 20pt;
                    font-weight: bold;
                    text-align: center;
                    margin-bottom: 16pt;
                }}
                .author, .date {{
                    text-align: center;
                    margin-bottom: 12pt;
                    font-style: italic;
                }}
                h1, h2 {{
                    font-size: 16pt;
                    font-weight: bold;
                    margin-top: 20pt;
                    margin-bottom: 12pt;
                }}
                h3 {{
                    font-size: 14pt;
                    font-weight: bold;
                    margin-top: 16pt;
                    margin-bottom: 10pt;
                }}
                h4, h5 {{
                    font-size: 12pt;
                    font-weight: bold;
                    margin-top: 12pt;
                    margin-bottom: 8pt;
                }}
                p {{
                    margin-bottom: 12pt;
                    text-indent: 0;
                }}
                ul, ol {{
                    margin-bottom: 12pt;
                    padding-left: 30pt;
                }}
                li {{
                    margin-bottom: 6pt;
                }}
                blockquote {{
                    margin: 12pt 20pt;
                    padding: 8pt;
                    border-left: 3pt solid #ccc;
                    font-style: italic;
                }}
                code {{
                    font-family: 'Courier New', monospace;
                    background-color: #f5f5f5;
                    padding: 2pt;
                }}
                .math-block {{
                    text-align: center;
                    margin: 12pt 0;
                    font-family: 'Times New Roman', serif;
                }}
                .math-inline {{
                    font-family: 'Times New Roman', serif;
                }}
                .figure, .table {{
                    text-align: center;
                    margin: 20pt 0;
                    padding: 10pt;
                    border: 1pt solid #ccc;
                    background-color: #f9f9f9;
                }}
                strong {{ font-weight: bold; }}
                em {{ font-style: italic; }}
                u {{ text-decoration: underline; }}
            </style>
        </head>
        <body>
            <div>{content}</div>
        </body>
        </html>
        """

        return html_template

    @staticmethod
    def _html_to_pdf(html_content: str) -> bytes:
        try:
            # Render the HTML with WeasyPrint
            html_doc = HTML(string=html_content)
            pdf_bytes = html_doc.write_pdf()

            return pdf_bytes

        except Exception as e:
            raise Exception(f"Impossible to convert from HTML to PDF: {str(e)}")

    @staticmethod
    def convert_to_pdf_if_needed(file_content: bytes, filename: str) -> Tuple[bytes, str]:
        extension = FileConverter.get_file_extension(filename)

        if extension == '.pdf':
            return file_content, filename
        elif extension == '.docx':
            return FileConverter.convert_docx_to_pdf_mammoth(file_content, filename)
        elif extension in ['.tex', '.latex']:
            return FileConverter.convert_latex_to_pdf(file_content, filename)
        else:
            raise Exception(f"Format not supported: {extension}")


class AdvancedDocxConverter:
    @staticmethod
    def convert_docx_with_pandoc(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            import pypandoc

            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to a temporary file
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Convert to HTML with pandoc
                html_content = pypandoc.convert_file(docx_path, 'html')

                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")

                # Convert the HTML to PDF
                pdf_bytes = FileConverter._html_to_pdf(full_html)

                new_filename = original_filename.replace('.docx', '.pdf')

                return pdf_bytes, new_filename

        except ImportError:
            logger.warning("pypandoc not found for DOCX")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
        except Exception as e:
            logger.warning(f"pandoc error in DOCX: {e}, falling back to the standard converter")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
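A minimal sketch of how the converter is meant to be called outside the upload route; the sample file path is an assumption:

# sketch: convert an uploaded document to PDF bytes before storing it
from app.file_converter import FileConverter

with open("paper.docx", "rb") as fh:  # hypothetical input file
    pdf_bytes, pdf_name = FileConverter.convert_to_pdf_if_needed(fh.read(), "paper.docx")

with open(pdf_name, "wb") as out:
    out.write(pdf_bytes)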
233  backend/app/publication_routes.py  Normal file
@@ -0,0 +1,233 @@
from fastapi import Depends, APIRouter, Query, HTTPException
from sqlalchemy import select, or_, and_, asc, desc
from sqlalchemy.orm import selectinload
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional

from app.db import Publication, get_db, Keyword, Author, User
from app.schemas import PublicationOut, UserPublicationOut
from app.users import current_active_user

# Create router for publication endpoints
router = APIRouter()


# Endpoint to delete a publication
@router.delete("/publications/{publication_id}")
async def delete_publication(
    publication_id: int,
    user: User = Depends(current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """
    Delete a publication owned by the current user
    """
    # Find the publication with its relations
    result = await db.execute(
        select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        ).where(
            and_(
                Publication.id == publication_id,
                Publication.user_id == user.id  # Security: only the user's own publications
            )
        )
    )
    publication = result.scalar_one_or_none()

    if not publication:
        raise HTTPException(
            status_code=404,
            detail="Publication not found or you don't have permission to delete it"
        )

    publication_title = publication.title

    # Delete the publication (many-to-many relations are deleted automatically)
    await db.delete(publication)
    await db.commit()

    print(f"🗑️ Publication deleted: '{publication_title}' (ID: {publication_id}) by user {user.email}")

    return {"message": f"Publication '{publication_title}' successfully deleted"}


# Endpoint for user publications with sorting
@router.get("/users/me/publications", response_model=List[UserPublicationOut])
async def get_user_publications(
    order_by: Optional[str] = Query("date_desc",
                                    description="Sort by: date_asc, date_desc, title_asc, title_desc"),
    user: User = Depends(current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """
    Returns all publications uploaded by the current user, with sorting
    """
    stmt = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).where(
        Publication.user_id == user.id
    )

    # Sorting management
    if order_by == "date_asc":
        stmt = stmt.order_by(asc(Publication.upload_date))
    elif order_by == "date_desc":
        stmt = stmt.order_by(desc(Publication.upload_date))
    elif order_by == "title_asc":
        stmt = stmt.order_by(asc(Publication.title))
    elif order_by == "title_desc":
        stmt = stmt.order_by(desc(Publication.title))
    else:
        # Default: descending by date (most recent first)
        stmt = stmt.order_by(desc(Publication.upload_date))

    result = await db.execute(stmt)
    publications = result.scalars().all()

    print(f"🔍 User {user.email} (ID: {user.id}) has {len(publications)} publications (sorted by: {order_by})")

    return publications


# Search publications endpoint
@router.get("/publications", response_model=List[PublicationOut])
async def get_publications(
    search: Optional[str] = Query(None,
                                  description="Search by title, author or keyword. For multiple keywords use spaces: 'keyword1 keyword2'"),
    order_by: Optional[str] = Query("date_desc",
                                    description="Sort by: date_asc, date_desc, title_asc, title_desc"),
    db: AsyncSession = Depends(get_db)
):
    """
    Advanced keyword-based search.

    The search runs in priority order:
    1. Keywords (highest priority) - supports multiple terms separated by spaces
    2. Authors (medium priority)
    3. Title (lowest priority)
    """

    print(f"🔍 Search: '{search}' | Sort by: {order_by}")

    # If there is no search query, return everything, sorted
    if search is None or not search.strip():
        stmt = select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        )

        # Sorting management
        if order_by == "date_asc":
            stmt = stmt.order_by(asc(Publication.upload_date))
        elif order_by == "date_desc":
            stmt = stmt.order_by(desc(Publication.upload_date))
        elif order_by == "title_asc":
            stmt = stmt.order_by(asc(Publication.title))
        elif order_by == "title_desc":
            stmt = stmt.order_by(desc(Publication.title))
        else:
            # Default: descending by date
            stmt = stmt.order_by(desc(Publication.upload_date))

        result = await db.execute(stmt)
        return result.scalars().all()

    search_term = search.strip()

    # Split the search string into individual keywords
    search_keywords = [kw.strip().lower() for kw in search_term.split() if kw.strip()]
    print(f"🔍 Keywords to search: {search_keywords}")

    # Set to track IDs that have already been found
    found_publication_ids = set()
    final_results = []

    # 1. Search by keywords (highest priority) - supports multiple terms
    if search_keywords:
        print("🔍 Step 1: Searching by multiple keywords...")

        # Create a condition for each keyword
        keyword_conditions = []
        for keyword in search_keywords:
            keyword_pattern = f"%{keyword}%"
            keyword_conditions.append(
                Publication.keywords.any(Keyword.name.ilike(keyword_pattern))
            )

        # The publication must match ALL keywords (AND)
        keyword_query = select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        ).where(
            and_(*keyword_conditions)  # All conditions must be true
        )

        keyword_result = await db.execute(keyword_query)
        keyword_publications = keyword_result.scalars().all()

        for pub in keyword_publications:
            if pub.id not in found_publication_ids:
                final_results.append(pub)
                found_publication_ids.add(pub.id)
                pub_keywords = [k.name for k in pub.keywords]
                print(f" ✅ Found by keywords: {pub.title} (keywords: {pub_keywords})")

    # 2. Search by authors (medium priority) - uses the complete search string
    print("🔍 Step 2: Searching by authors...")
    author_pattern = f"%{search_term}%"
    author_query = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).join(Publication.authors).where(
        Author.name.ilike(author_pattern)
    )

    author_result = await db.execute(author_query)
    author_publications = author_result.scalars().all()

    for pub in author_publications:
        if pub.id not in found_publication_ids:
            final_results.append(pub)
            found_publication_ids.add(pub.id)
            pub_authors = [a.name for a in pub.authors]
            print(f" ✅ Found by author: {pub.title} (authors: {pub_authors})")

    # 3. Search by title (lowest priority) - uses the complete search string
    print("🔍 Step 3: Searching by title...")
    title_pattern = f"%{search_term}%"
    title_query = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).where(
        Publication.title.ilike(title_pattern)
    )

    title_result = await db.execute(title_query)
    title_publications = title_result.scalars().all()

    for pub in title_publications:
        if pub.id not in found_publication_ids:
            final_results.append(pub)
            found_publication_ids.add(pub.id)
            print(f" ✅ Found by title: {pub.title}")

    # Apply sorting to the final results
    print(f"🔍 Applying sorting: {order_by}")
    if order_by == "date_asc":
        final_results.sort(key=lambda x: x.upload_date)
    elif order_by == "date_desc":
        final_results.sort(key=lambda x: x.upload_date, reverse=True)
    elif order_by == "title_asc":
        final_results.sort(key=lambda x: x.title.lower())
    elif order_by == "title_desc":
        final_results.sort(key=lambda x: x.title.lower(), reverse=True)
    else:
        # Default: descending by date
        final_results.sort(key=lambda x: x.upload_date, reverse=True)

    print(f"🔍 Total results found: {len(final_results)}")
    return final_results
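A minimal sketch of querying the search endpoint; the base URL and search terms are assumptions:

# sketch: search for publications matching two keywords, newest first
import httpx

resp = httpx.get(
    "http://localhost:8000/publications",
    params={"search": "graphene spectroscopy", "order_by": "date_desc"},
)
for pub in resp.json():
    print(pub["title"], [k["name"] for k in pub["keywords"]])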
69  backend/app/schemas.py  Normal file
@@ -0,0 +1,69 @@
import uuid

from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime

from fastapi_users import schemas


class UserRead(schemas.BaseUser[uuid.UUID]):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class UserCreate(schemas.BaseUserCreate):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class UserUpdate(schemas.BaseUserUpdate):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class AuthorOut(BaseModel):
    id: int
    name: str

    class Config:
        orm_mode = True


class KeywordOut(BaseModel):
    id: int
    name: str

    class Config:
        orm_mode = True


class PublicationOut(BaseModel):
    id: int
    title: str
    filename: Optional[str]
    upload_date: datetime
    journal: Optional[str] = None
    year: Optional[int] = None
    doi: Optional[str] = None
    authors: List[AuthorOut]
    keywords: List[KeywordOut]
    user_id: Optional[uuid.UUID] = None

    class Config:
        orm_mode = True


class UserPublicationOut(BaseModel):
    id: int
    title: str
    filename: Optional[str]
    upload_date: datetime
    journal: Optional[str] = None
    year: Optional[int] = None
    doi: Optional[str] = None
    authors: List[AuthorOut]
    keywords: List[KeywordOut]

    class Config:
        orm_mode = True
253  backend/app/upload.py  Normal file
@@ -0,0 +1,253 @@
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import joinedload

from app.db import Publication, Author, Keyword, User, get_db
from app.utils import parser, nlp
from app.users import current_active_user
from app.file_converter import FileConverter, AdvancedDocxConverter

from typing import Optional
import logging

logger = logging.getLogger(__name__)

router = APIRouter()


@router.post("/upload/")
async def upload_publication(
    file: UploadFile = File(...),
    bibtex: Optional[UploadFile] = File(None),
    title: Optional[str] = Form(None),
    authors: Optional[str] = Form(None),
    year: Optional[int] = Form(None),
    journal: Optional[str] = Form(None),
    doi: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(current_active_user)
):
    try:
        bibtex_metadata = None

        if bibtex is not None:
            try:
                bibtex_content = (await bibtex.read()).decode("utf-8")
                b_title, b_authors, b_year, b_journal, b_doi = parser.bibtex(bibtex_content)
                bibtex_metadata = {
                    "title": b_title,
                    "authors": b_authors,
                    "year": b_year,
                    "journal": b_journal,
                    "doi": b_doi
                }

                # Form fields take precedence; BibTeX fills in whatever is missing
                title = title or b_title
                authors = authors or b_authors
                year = year or b_year
                journal = journal or b_journal
                doi = doi or b_doi

                logger.info(f"BibTeX processed. Metadata extracted: {bibtex_metadata}")

            except Exception as e:
                logger.error(f"BibTeX parsing error: {e}")
                raise HTTPException(
                    status_code=400,
                    detail=f"BibTeX parsing error: {str(e)}"
                )

        if doi and not is_valid_doi(doi):
            raise HTTPException(
                status_code=400,
                detail="Invalid DOI. Use this format: 10.xxxx/xxxxx"
            )

        if doi:
            existing_doi = await db.execute(
                select(Publication).where(Publication.doi == doi)
            )
            if existing_doi.scalar_one_or_none():
                raise HTTPException(
                    status_code=400,
                    detail="DOI already exists"
                )
        if bibtex is None:
            missing_fields = []
            if not title: missing_fields.append("title")
            if not authors: missing_fields.append("authors")
            if not year: missing_fields.append("year")
            if not journal: missing_fields.append("journal")

            if missing_fields:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_fields)}. "
                           f"Fill in the fields or upload a BibTeX file."
                )
            logger.info("Manual mode")
        else:
            if not all([title, authors, year, journal]):
                missing_from_bibtex = []
                if not title: missing_from_bibtex.append("title")
                if not authors: missing_from_bibtex.append("authors")
                if not year: missing_from_bibtex.append("year")
                if not journal: missing_from_bibtex.append("journal")

                logger.error(f"Missing from BibTeX: {missing_from_bibtex}")
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_from_bibtex)}. "
                )
            logger.info("BibTeX mode")

        if not file:
            raise HTTPException(status_code=400, detail="A file is required")

        allowed_extensions = ['.pdf', '.docx', '.tex', '.latex']
        file_extension = '.' + file.filename.split('.')[-1].lower() if '.' in file.filename else ''
        if file_extension not in allowed_extensions:
            logger.error(f"Extension not allowed: {file_extension}")
            raise HTTPException(
                status_code=400,
                detail=f"Extension not allowed. Supported extensions: {', '.join(allowed_extensions)}"
            )

        content = await file.read()
        logger.info(f"File uploaded: {file.filename} ({len(content)} bytes)")

        try:
            file_ext = FileConverter.get_file_extension(file.filename)
            conversion_method = "none"

            if file_ext == '.docx':
                try:
                    converted_content, final_filename = AdvancedDocxConverter.convert_docx_with_pandoc(
                        content, file.filename
                    )
                    conversion_method = "pandoc"
                    logger.info(f"DOCX converted with pandoc: {file.filename} -> {final_filename}")
                except Exception as pandoc_error:
                    logger.warning(f"Pandoc failed with DOCX: {pandoc_error}, falling back to mammoth")
                    converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                        content, file.filename
                    )
                    conversion_method = "mammoth"
                    logger.info(f"DOCX converted with mammoth: {file.filename} -> {final_filename}")
            else:
                converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                    content, file.filename
                )
                conversion_method = "standard" if file_ext in ['.tex', '.latex'] else "none"
                logger.info(f"File processed: {file.filename} -> {final_filename}")

        except Exception as e:
            logger.error(f"Error while converting the file: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error while converting the file: {str(e)}"
            )

        try:
            text = parser.extract_text(file.filename, content)
            keywords = nlp.extract_keywords(text)
            logger.info(f"{len(keywords)} keywords extracted")
        except Exception as e:
            logger.warning(f"Error while extracting keywords: {e}")
            keywords = []

        author_names = [a.strip() for a in authors.split(",") if a.strip()]
        keyword_names = [k.strip().lower() for k in keywords if k.strip()]

        logger.info(f"Authors to process: {author_names}")
        logger.info(f"Keywords to process: {keyword_names}")

        # Reuse existing authors, create the missing ones
        author_objs = []
        for name in author_names:
            result = await db.execute(select(Author).where(Author.name == name))
            author = result.scalar_one_or_none()
            if not author:
                author = Author(name=name)
                db.add(author)
                await db.flush()
                logger.info(f"New author created: {name}")
            else:
                logger.info(f"Existing author found: {name}")
            author_objs.append(author)

        # Reuse existing keywords, create the missing ones
        keyword_objs = []
        for kw in keyword_names:
            result = await db.execute(select(Keyword).where(Keyword.name == kw))
            keyword = result.scalar_one_or_none()
            if not keyword:
                keyword = Keyword(name=kw)
                db.add(keyword)
                await db.flush()
                logger.info(f"Keyword created: {kw}")
            else:
                logger.info(f"Existing keyword found: {kw}")
            keyword_objs.append(keyword)

        publication = Publication(
            title=title,
            file=converted_content,
            filename=final_filename,
            journal=journal,
            year=year,
            doi=doi,
            user_id=user.id,
            authors=author_objs,
            keywords=keyword_objs
        )
        db.add(publication)
        await db.commit()
        await db.refresh(publication)

        result = await db.execute(
            select(Publication)
            .options(joinedload(Publication.authors), joinedload(Publication.keywords))
            .where(Publication.id == publication.id)
        )
        publication_with_rel = result.unique().scalar_one()

        author_names_response = [author.name for author in publication_with_rel.authors]
        keyword_names_response = [kw.name for kw in publication_with_rel.keywords]

        response_data = {
            "id": publication_with_rel.id,
            "title": publication_with_rel.title,
            "authors": author_names_response,
            "keywords": keyword_names_response,
            "journal": publication_with_rel.journal,
            "year": publication_with_rel.year,
            "doi": publication_with_rel.doi,
            "original_filename": file.filename,
            "converted_filename": final_filename,
            "conversion_method": conversion_method
        }

        if bibtex is not None:
            response_data["metadata_source"] = "bibtex"
            response_data["bibtex_data"] = bibtex_metadata
            logger.info("Saved with BibTeX metadata")
        else:
            response_data["metadata_source"] = "manual"
            logger.info("Saved with manually entered metadata")

        return response_data

    except HTTPException:
        raise
    except Exception as e:
        await db.rollback()
        raise HTTPException(
            status_code=500,
            detail=f"Upload error: {str(e)}"
        )


def is_valid_doi(doi: str) -> bool:
    import re
    doi_pattern = r'^10\.\d{4,}/[-._;()/:\w\[\]]+$'
    return bool(re.match(doi_pattern, doi, re.IGNORECASE))
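A minimal sketch of calling the upload endpoint from a client; the base URL, file path, token and metadata values are all assumptions:

# sketch: upload a PDF with manual metadata; the token comes from /auth/jwt/login
import httpx

token = "..."  # hypothetical JWT obtained earlier
with open("paper.pdf", "rb") as fh:
    resp = httpx.post(
        "http://localhost:8000/upload/",
        headers={"Authorization": f"Bearer {token}"},
        files={"file": ("paper.pdf", fh, "application/pdf")},
        data={"title": "My Paper", "authors": "A. Rossi, B. Bianchi",
              "year": "2024", "journal": "Example Journal"},
    )
print(resp.json())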
56  backend/app/users.py  Normal file
@@ -0,0 +1,56 @@
import uuid
from typing import Optional

from fastapi import Depends, Request
from fastapi_users import BaseUserManager, FastAPIUsers, UUIDIDMixin, models
from fastapi_users.authentication import (
    AuthenticationBackend,
    BearerTransport,
    JWTStrategy,
)
from fastapi_users.db import SQLAlchemyUserDatabase

from app.db import User, get_user_db

# CHANGE ME: replace this secret before deploying
SECRET = "1d90d4315c0a0313fb65211fa82e88129cddedb8b662553fbd38f44be9dc818bbd8623ca0177d965e762ee9727b5f6a2bd98481311ecccbcae846bff4f57b8ce72a51fca3278caa05ff18e54c563788d2a67b44be6fc667c12d1b6c2d869f6637b67025a6aa938e811616f27c160a13dc7b653e56a9823f61a165cdf671f734c"


class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    reset_password_token_secret = SECRET
    verification_token_secret = SECRET

    async def on_after_register(self, user: User, request: Optional[Request] = None):
        print(f"User {user.id} has registered.")

    async def on_after_forgot_password(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"User {user.id} has forgotten their password. Reset token: {token}")

    async def on_after_request_verify(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"Verification requested for user {user.id}. Verification token: {token}")


async def get_user_manager(user_db: SQLAlchemyUserDatabase = Depends(get_user_db)):
    yield UserManager(user_db)


bearer_transport = BearerTransport(tokenUrl="auth/jwt/login")


def get_jwt_strategy() -> JWTStrategy[models.UP, models.ID]:
    return JWTStrategy(secret=SECRET, lifetime_seconds=3600)


auth_backend = AuthenticationBackend(
    name="jwt",
    transport=bearer_transport,
    get_strategy=get_jwt_strategy,
)

fastapi_users = FastAPIUsers[User, uuid.UUID](get_user_manager, [auth_backend])

current_active_user = fastapi_users.current_user(active=True)
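Since the SECRET above is marked CHANGE ME, a minimal sketch of generating a fresh value with the standard library only:

# sketch: generate a random hex secret for JWT signing and reset/verify tokens
import secrets

print(secrets.token_hex(64))  # paste the output into SECRET or load it from an environment variable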
6  backend/app/utils/nlp.py  Normal file
@@ -0,0 +1,6 @@
import yake

def extract_keywords(text: str, num_keywords: int = 5) -> list:
    kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=num_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, _ in keywords]
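A quick sketch of what the extractor returns; the sample abstract and the printed keywords are only illustrative:

# sketch: extract single-word keywords from a short abstract
from app.utils.nlp import extract_keywords

abstract = "We study graphene-based sensors for low-cost gas detection in ambient conditions."
print(extract_keywords(abstract, num_keywords=3))  # e.g. ['graphene', 'sensors', 'detection']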
165  backend/app/utils/parser.py  Normal file
@@ -0,0 +1,165 @@
import bibtexparser
import io
import logging
from typing import Tuple, Optional
from pdfminer.high_level import extract_text as pdf_extract_text
from pdfminer.high_level import extract_text_to_fp
import tempfile
import os

logger = logging.getLogger(__name__)


def bibtex(bibtex_content: str) -> Tuple[Optional[str], Optional[str], Optional[int], Optional[str], Optional[str]]:
    """
    Extracts title, authors, year, journal and doi from the first record of a BibTeX file.
    Returns a tuple (title, authors, year, journal, doi).
    """
    bib_database = bibtexparser.load(io.StringIO(bibtex_content))
    if not bib_database.entries:
        return (None, None, None, None, None)

    entry = bib_database.entries[0]
    title = entry.get('title')
    authors = entry.get('author') or entry.get('authors')  # standard BibTeX uses 'author'
    year = int(entry['year']) if 'year' in entry else None
    journal = entry.get('journal')
    doi = entry.get('doi')

    return (title, authors, year, journal, doi)


def extract_text(filename: str, content: bytes) -> str:
    """
    Core helper: extracts text from the uploaded file for keyword analysis.

    Args:
        filename: File name (used to determine the type)
        content: File content as bytes

    Returns:
        str: Text extracted from the document
    """
    try:
        # Determine the file extension
        file_extension = os.path.splitext(filename.lower())[1]

        if file_extension == '.pdf':
            return extract_text_from_pdf(content)
        elif file_extension == '.docx':
            return extract_text_from_docx(content)
        elif file_extension in ['.tex', '.latex']:
            return extract_text_from_latex(content)
        else:
            logger.warning(f"File type not supported for text extraction: {file_extension}")
            return ""

    except Exception as e:
        logger.error(f"Error extracting text from {filename}: {e}")
        return ""


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """
    Extracts text from PDF content using pdfminer
    """
    try:
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file.flush()

            # Extract the text using pdfminer
            text = pdf_extract_text(temp_file.name)

            # Clean up the temporary file
            os.unlink(temp_file.name)

            logger.info(f"Extracted PDF text: {len(text)} characters")
            return text or ""

    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return ""


def extract_text_from_docx(docx_content: bytes) -> str:
    """
    Extracts text from DOCX content using python-docx
    """
    try:
        from docx import Document

        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as temp_file:
            temp_file.write(docx_content)
            temp_file.flush()

            # Extract the text using python-docx
            doc = Document(temp_file.name)
            text_parts = []

            for paragraph in doc.paragraphs:
                text_parts.append(paragraph.text)

            text = '\n'.join(text_parts)

            # Clean up the temporary file
            os.unlink(temp_file.name)

            logger.info(f"Extracted DOCX text: {len(text)} characters")
            return text

    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""


def extract_text_from_latex(latex_content: bytes) -> str:
    """
    Extracts text from LaTeX content by stripping LaTeX commands
    """
    try:
        from pylatexenc.latex2text import LatexNodes2Text

        # Decode the content
        latex_text = latex_content.decode('utf-8', errors='ignore')

        # Convert LaTeX to plain text
        converter = LatexNodes2Text()
        text = converter.latex_to_text(latex_text)

        logger.info(f"Extracted LaTeX text: {len(text)} characters")
        return text

    except Exception as e:
        logger.error(f"Error extracting text from LaTeX: {e}")
        # Fallback: manually strip the most common LaTeX commands
        try:
            latex_text = latex_content.decode('utf-8', errors='ignore')
            # Remove basic LaTeX commands
            import re
            text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', latex_text)
            text = re.sub(r'\\[a-zA-Z]+', '', text)
            text = re.sub(r'\{[^}]*\}', '', text)
            text = re.sub(r'%.*', '', text)  # Remove comments
            return text.strip()
        except:
            return ""


def clean_extracted_text(text: str) -> str:
    """
    Cleans the extracted text to improve keyword extraction
    """
    import re

    # Remove control characters and collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove excessive special characters
    text = re.sub(r'[^\w\s\-.,;:()[\]{}]', ' ', text)

    # Drop very short lines (probably headers/footers)
    lines = text.split('\n')
    clean_lines = [line.strip() for line in lines if len(line.strip()) > 10]

    return '\n'.join(clean_lines).strip()
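A minimal sketch of what parser.bibtex returns for a typical entry; the record below is made up:

# sketch: parse a single BibTeX record into the (title, authors, year, journal, doi) tuple
from app.utils import parser

record = """@article{rossi2024,
  author  = {Rossi, Anna and Bianchi, Bruno},
  title   = {An Example Study},
  journal = {Example Journal},
  year    = {2024},
  doi     = {10.1234/example.2024.001}
}"""
print(parser.bibtex(record))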
14  backend/docker-compose.yml  Normal file
@@ -0,0 +1,14 @@
services:
  db:
    container_name: pg
    image: postgres:15-alpine
    env_file:
      - ./.env
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data

volumes:
  postgres_data:
9  backend/main.py  Normal file
@@ -0,0 +1,9 @@
from dotenv import load_dotenv
import os

load_dotenv()
#from database import save_publication
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.app:app", host="0.0.0.0", log_level="info")
34  backend/requirements.txt  Normal file
@@ -0,0 +1,34 @@
#backend core
fastapi[all]
fastapi-asyncpg
fastapi-users[sqlalchemy,postgresql]

#database drivers
asyncpg

#parser
python-multipart

#web server
uvicorn

#utils
pdfminer.six
python-docx
pylatexenc
bibtexparser

#NLP
yake

#file conversion
python-docx
reportlab
weasyprint
markdown
mammoth
pypandoc-binary

boto3

python-dotenv