Initial release

Francesco Carmelo Capria 2025-06-21 18:15:33 +02:00
commit ae5e4b8873
52 changed files with 17572 additions and 0 deletions

82
backend/app/app.py Normal file

@@ -0,0 +1,82 @@
from contextlib import asynccontextmanager

from fastapi import Depends, FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.db import User, create_db_and_tables
from app.schemas import UserCreate, UserRead, UserUpdate
from app.users import auth_backend, current_active_user, fastapi_users
from app.upload import router as upload_router
from app.download import router as download_router
from app.publication_routes import router as publication_router
from app.debug_routes import router as debug_router


@asynccontextmanager
async def lifespan(app: FastAPI):
    await create_db_and_tables()
    yield


app = FastAPI(
    title="Scientify API",
    description="API for managing scientific publications",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://frontend:80"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers for different parts of the application
app.include_router(upload_router)
app.include_router(download_router)
app.include_router(publication_router)
app.include_router(debug_router)

# Authentication and user management routes
app.include_router(
    fastapi_users.get_auth_router(auth_backend), prefix="/auth/jwt", tags=["auth"]
)
app.include_router(
    fastapi_users.get_register_router(UserRead, UserCreate),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_reset_password_router(),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_verify_router(UserRead),
    prefix="/auth",
    tags=["auth"],
)
app.include_router(
    fastapi_users.get_users_router(UserRead, UserUpdate),
    prefix="/users",
    tags=["users"],
)


@app.get("/authenticated-route")
async def authenticated_route(user: User = Depends(current_active_user)):
    return {"message": f"Hello {user.email}!"}


@app.get("/")
async def root():
    """Root endpoint for the Scientify API."""
    return {
        "message": "Welcome to Scientify API",
        "description": "The intelligent platform to manage your scientific publications",
        "documentation": "/docs",
    }

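A quick client-side sketch of the JWT flow this wiring produces (assumptions: the API is reachable at http://localhost:8000, a user was already registered via POST /auth/register, httpx is installed, and the credentials are placeholders):

import httpx

BASE = "http://localhost:8000"

# fastapi-users' JWT login endpoint expects form-encoded credentials.
resp = httpx.post(
    f"{BASE}/auth/jwt/login",
    data={"username": "user@example.com", "password": "secret"},
)
resp.raise_for_status()
token = resp.json()["access_token"]

# The protected route greets the authenticated user.
me = httpx.get(
    f"{BASE}/authenticated-route",
    headers={"Authorization": f"Bearer {token}"},
)
print(me.json())  # {"message": "Hello user@example.com!"}
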
100
backend/app/db.py Normal file

@@ -0,0 +1,100 @@
import datetime
import os
import uuid

from fastapi import Depends
from fastapi_users.db import SQLAlchemyBaseUserTableUUID, SQLAlchemyUserDatabase
from sqlalchemy import Column, Integer, String, Table, ForeignKey, LargeBinary, DateTime
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, relationship

DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+asyncpg://scientify_user:scientify_pass@db:5432/scientify_db",
)
if "+asyncpg" not in DATABASE_URL:
    raise ValueError(
        "DATABASE_URL must use the asyncpg driver for async operations. "
        "Use postgresql+asyncpg://..."
    )


class Base(DeclarativeBase):
    pass


class User(SQLAlchemyBaseUserTableUUID, Base):
    first_name = Column(String, nullable=True)
    last_name = Column(String, nullable=True)
    publications = relationship("Publication", back_populates="user")


try:
    engine = create_async_engine(DATABASE_URL, echo=True)
    print("Database engine created successfully")
except Exception as e:
    print(f"Error creating database engine: {e}")
    raise

# async_sessionmaker (rather than the sync sessionmaker) is the idiomatic
# factory for AsyncSession instances.
async_session_maker = async_sessionmaker(engine, expire_on_commit=False)


async def get_db():
    async with async_session_maker() as session:
        yield session


async def create_db_and_tables():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)


async def get_async_session():
    async with async_session_maker() as session:
        yield session


async def get_user_db(session: AsyncSession = Depends(get_async_session)):
    yield SQLAlchemyUserDatabase(session, User)


publication_authors = Table(
    'publication_authors', Base.metadata,
    Column('publication_id', Integer, ForeignKey('publications.id', ondelete='CASCADE')),
    Column('author_id', Integer, ForeignKey('authors.id', ondelete='CASCADE')),
)

publication_keywords = Table(
    'publication_keywords', Base.metadata,
    Column('publication_id', Integer, ForeignKey('publications.id', ondelete='CASCADE')),
    Column('keyword_id', Integer, ForeignKey('keywords.id', ondelete='CASCADE')),
)


class Author(Base):
    __tablename__ = 'authors'

    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)


class Keyword(Base):
    __tablename__ = 'keywords'

    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)


class Publication(Base):
    __tablename__ = 'publications'

    id = Column(Integer, primary_key=True)
    title = Column(String, nullable=False)
    file = Column(LargeBinary, nullable=False)
    filename = Column(String)
    upload_date = Column(DateTime, default=datetime.datetime.utcnow)
    journal = Column(String, nullable=True)
    year = Column(Integer, nullable=True)
    doi = Column(String, nullable=True, unique=True)
    user_id = Column(UUID(as_uuid=True), ForeignKey('user.id'), nullable=False)

    user = relationship("User", back_populates="publications")
    authors = relationship('Author', secondary=publication_authors, backref='publications')
    keywords = relationship('Keyword', secondary=publication_keywords, backref='publications')

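A sketch of querying these models directly through the session factory (assumes the Postgres instance from DATABASE_URL is reachable; the keyword value is illustrative):

import asyncio
from sqlalchemy import select
from app.db import async_session_maker, Publication, Keyword

async def titles_for_keyword(name: str) -> list[str]:
    # .any() generates an EXISTS over the publication_keywords association table
    async with async_session_maker() as session:
        stmt = select(Publication).where(
            Publication.keywords.any(Keyword.name == name)
        )
        result = await session.execute(stmt)
        return [pub.title for pub in result.scalars().all()]

print(asyncio.run(titles_for_keyword("physics")))
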
59
backend/app/debug_routes.py Normal file

@@ -0,0 +1,59 @@
from fastapi import Depends, APIRouter
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import Publication, get_db, Keyword, Author

router = APIRouter(prefix="/debug", tags=["debug"])


@router.get("/publications")
async def debug_publications(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all publications with their complete data."""
    stmt = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords),
        selectinload(Publication.user)
    ).order_by(Publication.upload_date.desc())
    result = await db.execute(stmt)
    publications = result.scalars().all()

    debug_data = []
    for pub in publications:
        debug_data.append({
            "id": pub.id,
            "title": pub.title,
            "authors": [{"id": a.id, "name": a.name} for a in pub.authors],
            "keywords": [{"id": k.id, "name": k.name} for k in pub.keywords],
            "upload_date": pub.upload_date,
            "journal": pub.journal,
            "year": pub.year,
            "doi": pub.doi,
            "user_email": pub.user.email if pub.user else None,
            "user_id": str(pub.user_id) if pub.user_id else None,
        })
    return {
        "total_publications": len(publications),
        "publications": debug_data,
    }


@router.get("/authors")
async def debug_authors(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all authors."""
    result = await db.execute(select(Author))
    authors = result.scalars().all()
    return [{"id": a.id, "name": a.name} for a in authors]


@router.get("/keywords")
async def debug_keywords(db: AsyncSession = Depends(get_db)):
    """Debug endpoint to view all keywords."""
    result = await db.execute(select(Keyword))
    keywords = result.scalars().all()
    return [{"id": k.id, "name": k.name} for k in keywords]

26
backend/app/download.py Normal file

@@ -0,0 +1,26 @@
import io

from fastapi import APIRouter, HTTPException, Depends
from fastapi.responses import StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.db import Publication, get_db

router = APIRouter()


@router.get("/download/{publication_id}")
async def download_publication(publication_id: int, db: AsyncSession = Depends(get_db)):
    result = await db.execute(select(Publication).where(Publication.id == publication_id))
    publication = result.scalar_one_or_none()
    if not publication:
        raise HTTPException(status_code=404, detail="Publication not found")

    file_bytes = publication.file
    filename = publication.filename or "document.pdf"
    return StreamingResponse(
        io.BytesIO(file_bytes),
        media_type="application/pdf",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )

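On the client side, the streamed response can be written straight to disk; a sketch (assumes publication 1 exists and the server is local):

import httpx

with httpx.stream("GET", "http://localhost:8000/download/1") as resp:
    resp.raise_for_status()
    with open("publication.pdf", "wb") as f:
        for chunk in resp.iter_bytes():
            f.write(chunk)
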
362
backend/app/file_converter.py Normal file

@@ -0,0 +1,362 @@
import os
import tempfile
from pathlib import Path
from typing import Tuple
import logging
import re

from docx import Document as DocxDocument
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from weasyprint import HTML
import mammoth

logger = logging.getLogger(__name__)


class FileConverter:
    @staticmethod
    def get_file_extension(filename: str) -> str:
        return Path(filename).suffix.lower()
    @staticmethod
    def convert_docx_to_pdf_reportlab(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to disk
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Read the DOCX
                doc = DocxDocument(docx_path)

                # Build the PDF
                pdf_path = os.path.join(temp_dir, "output.pdf")
                FileConverter._create_pdf_from_docx(doc, pdf_path)

                # Read the PDF back
                with open(pdf_path, "rb") as f:
                    pdf_content = f.read()

                new_filename = original_filename.replace('.docx', '.pdf')
                return pdf_content, new_filename
        except Exception as e:
            logger.warning(f"reportlab conversion failed: {e}, falling back to mammoth")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
    @staticmethod
    def convert_docx_to_pdf_mammoth(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to disk
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Convert to HTML with mammoth
                with open(docx_path, "rb") as docx_file:
                    result = mammoth.convert_to_html(docx_file)
                    html_content = result.value

                # Wrap the HTML with print styles
                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")

                # Convert to PDF
                pdf_bytes = FileConverter._html_to_pdf(full_html)

                new_filename = original_filename.replace('.docx', '.pdf')
                return pdf_bytes, new_filename
        except Exception as e:
            raise Exception(f"Unable to convert DOCX to PDF: {str(e)}")
    @staticmethod
    def _create_pdf_from_docx(docx_doc, output_path: str):
        doc = SimpleDocTemplate(output_path, pagesize=A4)
        styles = getSampleStyleSheet()
        story = []

        # Custom styles
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=12,
            textColor='black'
        )
        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            textColor='black'
        )

        for paragraph in docx_doc.paragraphs:
            if paragraph.text.strip():
                # Heuristic: short all-caps lines and bold-leading paragraphs are headings
                if len(paragraph.text) < 100 and paragraph.text.isupper():
                    style = title_style
                elif paragraph.runs and paragraph.runs[0].bold:
                    style = title_style
                else:
                    style = normal_style
                p = Paragraph(paragraph.text, style)
                story.append(p)
                story.append(Spacer(1, 6))

        if not story:
            story.append(Paragraph("DOCX converted", normal_style))

        # Build the PDF
        doc.build(story)
    @staticmethod
    def convert_latex_to_pdf(latex_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            # Decode the LaTeX source
            latex_text = latex_content.decode('utf-8', errors='ignore')

            # Convert to HTML
            html_content = FileConverter._latex_to_html_advanced(latex_text)

            # Convert to PDF
            pdf_bytes = FileConverter._html_to_pdf(html_content)

            new_filename = original_filename.replace('.tex', '.pdf').replace('.latex', '.pdf')
            return pdf_bytes, new_filename
        except Exception as e:
            raise Exception(f"Unable to convert LaTeX to PDF: {str(e)}")
    @staticmethod
    def _latex_to_html_advanced(latex_text: str) -> str:
        html = latex_text

        # Strip the preamble and document wrappers
        html = re.sub(r'\\documentclass(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\begin\{document\}', '', html)
        html = re.sub(r'\\end\{document\}', '', html)
        html = re.sub(r'\\maketitle', '', html)

        # Title block
        html = re.sub(r'\\title\{([^}]*)\}', r'<h1 class="title">\1</h1>', html)
        html = re.sub(r'\\author\{([^}]*)\}', r'<p class="author"><strong>Author:</strong> \1</p>', html)
        html = re.sub(r'\\date\{([^}]*)\}', r'<p class="date"><strong>Date:</strong> \1</p>', html)

        # Sectioning
        html = re.sub(r'\\section\*?\{([^}]*)\}', r'<h2>\1</h2>', html)
        html = re.sub(r'\\subsection\*?\{([^}]*)\}', r'<h3>\1</h3>', html)
        html = re.sub(r'\\subsubsection\*?\{([^}]*)\}', r'<h4>\1</h4>', html)
        html = re.sub(r'\\paragraph\{([^}]*)\}', r'<h5>\1</h5>', html)

        # Inline formatting
        html = re.sub(r'\\textbf\{([^}]*)\}', r'<strong>\1</strong>', html)
        html = re.sub(r'\\textit\{([^}]*)\}', r'<em>\1</em>', html)
        html = re.sub(r'\\emph\{([^}]*)\}', r'<em>\1</em>', html)
        html = re.sub(r'\\underline\{([^}]*)\}', r'<u>\1</u>', html)
        html = re.sub(r'\\texttt\{([^}]*)\}', r'<code>\1</code>', html)

        # Math (kept as plain text inside styled spans)
        html = re.sub(r'\$\$([^$]+)\$\$', r'<div class="math-block">\1</div>', html)
        html = re.sub(r'\$([^$]+)\$', r'<span class="math-inline">\1</span>', html)

        # Lists and quotes
        html = re.sub(r'\\begin\{itemize\}', '<ul>', html)
        html = re.sub(r'\\end\{itemize\}', '</ul>', html)
        html = re.sub(r'\\begin\{enumerate\}', '<ol>', html)
        html = re.sub(r'\\end\{enumerate\}', '</ol>', html)
        html = re.sub(r'\\item(?:\[[^\]]*\])?\s*', '<li>', html)
        html = re.sub(r'\\begin\{quote\}', '<blockquote>', html)
        html = re.sub(r'\\end\{quote\}', '</blockquote>', html)

        # Floats are replaced by placeholders
        html = re.sub(r'\\begin\{figure\}.*?\\end\{figure\}', '<div class="figure">[Figure]</div>', html,
                      flags=re.DOTALL)
        html = re.sub(r'\\begin\{table\}.*?\\end\{table\}', '<div class="table">[Table]</div>', html,
                      flags=re.DOTALL)

        # Drop any remaining commands, then normalize whitespace
        html = re.sub(r'\\[a-zA-Z]+(?:\[[^\]]*\])?\{[^}]*\}', '', html)
        html = re.sub(r'\\[a-zA-Z]+', '', html)
        html = re.sub(r'\\\\', '<br>', html)
        html = re.sub(r'\n\s*\n', '</p><p>', html)
        html = re.sub(r'\s+', ' ', html)
        html = html.strip()

        return FileConverter._wrap_html_with_styles(html, "LaTeX Document")
    @staticmethod
    def _wrap_html_with_styles(content: str, title: str) -> str:
        html_template = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
            <style>
                @page {{
                    size: A4;
                    margin: 2cm;
                }}
                body {{
                    font-family: 'Times New Roman', serif;
                    font-size: 12pt;
                    line-height: 1.6;
                    text-align: justify;
                    color: #000;
                }}
                .title {{
                    font-size: 20pt;
                    font-weight: bold;
                    text-align: center;
                    margin-bottom: 16pt;
                }}
                .author, .date {{
                    text-align: center;
                    margin-bottom: 12pt;
                    font-style: italic;
                }}
                h1, h2 {{
                    font-size: 16pt;
                    font-weight: bold;
                    margin-top: 20pt;
                    margin-bottom: 12pt;
                }}
                h3 {{
                    font-size: 14pt;
                    font-weight: bold;
                    margin-top: 16pt;
                    margin-bottom: 10pt;
                }}
                h4, h5 {{
                    font-size: 12pt;
                    font-weight: bold;
                    margin-top: 12pt;
                    margin-bottom: 8pt;
                }}
                p {{
                    margin-bottom: 12pt;
                    text-indent: 0;
                }}
                ul, ol {{
                    margin-bottom: 12pt;
                    padding-left: 30pt;
                }}
                li {{
                    margin-bottom: 6pt;
                }}
                blockquote {{
                    margin: 12pt 20pt;
                    padding: 8pt;
                    border-left: 3pt solid #ccc;
                    font-style: italic;
                }}
                code {{
                    font-family: 'Courier New', monospace;
                    background-color: #f5f5f5;
                    padding: 2pt;
                }}
                .math-block {{
                    text-align: center;
                    margin: 12pt 0;
                    font-family: 'Times New Roman', serif;
                }}
                .math-inline {{
                    font-family: 'Times New Roman', serif;
                }}
                .figure, .table {{
                    text-align: center;
                    margin: 20pt 0;
                    padding: 10pt;
                    border: 1pt solid #ccc;
                    background-color: #f9f9f9;
                }}
                strong {{ font-weight: bold; }}
                em {{ font-style: italic; }}
                u {{ text-decoration: underline; }}
            </style>
        </head>
        <body>
            <div>{content}</div>
        </body>
        </html>
        """
        return html_template
    @staticmethod
    def _html_to_pdf(html_content: str) -> bytes:
        try:
            # Render the HTML to PDF with WeasyPrint
            html_doc = HTML(string=html_content)
            pdf_bytes = html_doc.write_pdf()
            return pdf_bytes
        except Exception as e:
            raise Exception(f"Unable to convert HTML to PDF: {str(e)}")

    @staticmethod
    def convert_to_pdf_if_needed(file_content: bytes, filename: str) -> Tuple[bytes, str]:
        extension = FileConverter.get_file_extension(filename)
        if extension == '.pdf':
            return file_content, filename
        elif extension == '.docx':
            return FileConverter.convert_docx_to_pdf_mammoth(file_content, filename)
        elif extension in ['.tex', '.latex']:
            return FileConverter.convert_latex_to_pdf(file_content, filename)
        else:
            raise Exception(f"Unsupported format: {extension}")
class AdvancedDocxConverter:
    @staticmethod
    def convert_docx_with_pandoc(docx_content: bytes, original_filename: str) -> Tuple[bytes, str]:
        try:
            import pypandoc

            with tempfile.TemporaryDirectory() as temp_dir:
                # Save the DOCX to disk
                docx_path = os.path.join(temp_dir, "temp.docx")
                with open(docx_path, "wb") as f:
                    f.write(docx_content)

                # Convert to HTML with pandoc
                html_content = pypandoc.convert_file(docx_path, 'html')
                full_html = FileConverter._wrap_html_with_styles(html_content, "DOCX Document")

                # Convert the HTML to PDF
                pdf_bytes = FileConverter._html_to_pdf(full_html)

                new_filename = original_filename.replace('.docx', '.pdf')
                return pdf_bytes, new_filename
        except ImportError:
            logger.warning("pypandoc not available, falling back to the mammoth converter")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)
        except Exception as e:
            logger.warning(f"pandoc error on DOCX: {e}, falling back to the mammoth converter")
            return FileConverter.convert_docx_to_pdf_mammoth(docx_content, original_filename)

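A sketch of driving the dispatcher directly on an in-memory LaTeX source (assumes WeasyPrint's native dependencies are installed; the document is a throwaway sample):

from app.file_converter import FileConverter

tex_source = rb"""
\documentclass{article}
\title{Sample}
\begin{document}
\maketitle
\section{Introduction}
Hello \textbf{world}.
\end{document}
"""

# Dispatches on the extension: .tex goes through the regex-based HTML path.
pdf_bytes, pdf_name = FileConverter.convert_to_pdf_if_needed(tex_source, "sample.tex")
with open(pdf_name, "wb") as f:  # writes sample.pdf
    f.write(pdf_bytes)
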
233
backend/app/publication_routes.py Normal file

@@ -0,0 +1,233 @@
from fastapi import Depends, APIRouter, Query, HTTPException
from sqlalchemy import select, and_, asc, desc
from sqlalchemy.orm import selectinload
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional

from app.db import Publication, get_db, Keyword, Author, User
from app.schemas import PublicationOut, UserPublicationOut
from app.users import current_active_user

# Create router for publication endpoints
router = APIRouter()


# Endpoint to delete a publication
@router.delete("/publications/{publication_id}")
async def delete_publication(
    publication_id: int,
    user: User = Depends(current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """Delete a publication owned by the current user."""
    # Find the publication with its relations
    result = await db.execute(
        select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        ).where(
            and_(
                Publication.id == publication_id,
                Publication.user_id == user.id  # Security: only the user's own publications
            )
        )
    )
    publication = result.scalar_one_or_none()

    if not publication:
        raise HTTPException(
            status_code=404,
            detail="Publication not found or you don't have permission to delete it"
        )

    publication_title = publication.title

    # Delete the publication (association rows are removed via ON DELETE CASCADE)
    await db.delete(publication)
    await db.commit()

    print(f"Publication deleted: '{publication_title}' (ID: {publication_id}) by user {user.email}")
    return {"message": f"Publication '{publication_title}' successfully deleted"}


# Endpoint for the current user's publications, with sorting
@router.get("/users/me/publications", response_model=List[UserPublicationOut])
async def get_user_publications(
    order_by: Optional[str] = Query(
        "date_desc",
        description="Sort by: date_asc, date_desc, title_asc, title_desc"
    ),
    user: User = Depends(current_active_user),
    db: AsyncSession = Depends(get_db)
):
    """Return all publications uploaded by the current user, sorted."""
    stmt = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).where(
        Publication.user_id == user.id
    )

    # Sorting
    if order_by == "date_asc":
        stmt = stmt.order_by(asc(Publication.upload_date))
    elif order_by == "date_desc":
        stmt = stmt.order_by(desc(Publication.upload_date))
    elif order_by == "title_asc":
        stmt = stmt.order_by(asc(Publication.title))
    elif order_by == "title_desc":
        stmt = stmt.order_by(desc(Publication.title))
    else:
        # Default: most recent first
        stmt = stmt.order_by(desc(Publication.upload_date))

    result = await db.execute(stmt)
    publications = result.scalars().all()
    print(f"User {user.email} (ID: {user.id}) has {len(publications)} publications (sorted by: {order_by})")
    return publications


# Search publications endpoint
@router.get("/publications", response_model=List[PublicationOut])
async def get_publications(
    search: Optional[str] = Query(
        None,
        description="Search by title, author or keyword. For multiple keywords use spaces: 'keyword1 keyword2'"
    ),
    order_by: Optional[str] = Query(
        "date_desc",
        description="Sort by: date_asc, date_desc, title_asc, title_desc"
    ),
    db: AsyncSession = Depends(get_db)
):
    """
    Search with priority tiers, then sorting:
    1. Keywords (highest priority) - a space-separated query must match ALL keywords
    2. Authors (medium priority) - matched against the complete search string
    3. Title (lowest priority) - matched against the complete search string
    """
    print(f"Search: '{search}' | Sort by: {order_by}")

    # With no search query, return everything, sorted
    if search is None or not search.strip():
        stmt = select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        )
        if order_by == "date_asc":
            stmt = stmt.order_by(asc(Publication.upload_date))
        elif order_by == "date_desc":
            stmt = stmt.order_by(desc(Publication.upload_date))
        elif order_by == "title_asc":
            stmt = stmt.order_by(asc(Publication.title))
        elif order_by == "title_desc":
            stmt = stmt.order_by(desc(Publication.title))
        else:
            # Default: most recent first
            stmt = stmt.order_by(desc(Publication.upload_date))
        result = await db.execute(stmt)
        return result.scalars().all()

    search_term = search.strip()
    # Split the search string into individual keywords
    search_keywords = [kw.strip().lower() for kw in search_term.split() if kw.strip()]
    print(f"Keywords to search: {search_keywords}")

    # Track IDs that have already been collected
    found_publication_ids = set()
    final_results = []

    # 1. Search by keywords (highest priority), multiple terms combined with AND
    if search_keywords:
        print("Step 1: searching by keywords...")
        keyword_conditions = []
        for keyword in search_keywords:
            keyword_pattern = f"%{keyword}%"
            keyword_conditions.append(
                Publication.keywords.any(Keyword.name.ilike(keyword_pattern))
            )

        # The publication must match ALL keywords
        keyword_query = select(Publication).options(
            selectinload(Publication.authors),
            selectinload(Publication.keywords)
        ).where(
            and_(*keyword_conditions)
        )
        keyword_result = await db.execute(keyword_query)
        keyword_publications = keyword_result.scalars().all()

        for pub in keyword_publications:
            if pub.id not in found_publication_ids:
                final_results.append(pub)
                found_publication_ids.add(pub.id)
                pub_keywords = [k.name for k in pub.keywords]
                print(f"  Found by keywords: {pub.title} (keywords: {pub_keywords})")

    # 2. Search by authors (medium priority), using the complete string
    print("Step 2: searching by authors...")
    author_pattern = f"%{search_term}%"
    author_query = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).join(Publication.authors).where(
        Author.name.ilike(author_pattern)
    )
    author_result = await db.execute(author_query)
    author_publications = author_result.scalars().all()

    for pub in author_publications:
        if pub.id not in found_publication_ids:
            final_results.append(pub)
            found_publication_ids.add(pub.id)
            pub_authors = [a.name for a in pub.authors]
            print(f"  Found by author: {pub.title} (authors: {pub_authors})")

    # 3. Search by title (lowest priority), using the complete string
    print("Step 3: searching by title...")
    title_pattern = f"%{search_term}%"
    title_query = select(Publication).options(
        selectinload(Publication.authors),
        selectinload(Publication.keywords)
    ).where(
        Publication.title.ilike(title_pattern)
    )
    title_result = await db.execute(title_query)
    title_publications = title_result.scalars().all()

    for pub in title_publications:
        if pub.id not in found_publication_ids:
            final_results.append(pub)
            found_publication_ids.add(pub.id)
            print(f"  Found by title: {pub.title}")

    # Apply sorting to the merged results
    print(f"Applying sorting: {order_by}")
    if order_by == "date_asc":
        final_results.sort(key=lambda x: x.upload_date)
    elif order_by == "date_desc":
        final_results.sort(key=lambda x: x.upload_date, reverse=True)
    elif order_by == "title_asc":
        final_results.sort(key=lambda x: x.title.lower())
    elif order_by == "title_desc":
        final_results.sort(key=lambda x: x.title.lower(), reverse=True)
    else:
        # Default: most recent first
        final_results.sort(key=lambda x: x.upload_date, reverse=True)

    print(f"Total results found: {len(final_results)}")
    return final_results

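From a client's point of view: every space-separated term must match a keyword to hit tier 1, tiers 2 and 3 match the complete string against authors and titles, and the merged, deduplicated set is then re-sorted by order_by. A sketch against a running instance:

import httpx

# Only publications tagged with BOTH "neural" AND "networks" keywords match
# tier 1; author/title substring matches on "neural networks" are merged in,
# then everything is sorted by upload date.
r = httpx.get(
    "http://localhost:8000/publications",
    params={"search": "neural networks", "order_by": "date_desc"},
)
for pub in r.json():
    print(pub["title"], [k["name"] for k in pub["keywords"]])
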
69
backend/app/schemas.py Normal file

@@ -0,0 +1,69 @@
import uuid
from pydantic import BaseModel
from typing import List, Optional
from datetime import datetime

from fastapi_users import schemas


class UserRead(schemas.BaseUser[uuid.UUID]):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class UserCreate(schemas.BaseUserCreate):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class UserUpdate(schemas.BaseUserUpdate):
    first_name: Optional[str] = None
    last_name: Optional[str] = None


class AuthorOut(BaseModel):
    id: int
    name: str

    class Config:
        orm_mode = True


class KeywordOut(BaseModel):
    id: int
    name: str

    class Config:
        orm_mode = True


class PublicationOut(BaseModel):
    id: int
    title: str
    filename: Optional[str]
    upload_date: datetime
    journal: Optional[str] = None
    year: Optional[int] = None
    doi: Optional[str] = None
    authors: List[AuthorOut]
    keywords: List[KeywordOut]
    user_id: Optional[uuid.UUID] = None

    class Config:
        orm_mode = True


class UserPublicationOut(BaseModel):
    id: int
    title: str
    filename: Optional[str]
    upload_date: datetime
    journal: Optional[str] = None
    year: Optional[int] = None
    doi: Optional[str] = None
    authors: List[AuthorOut]
    keywords: List[KeywordOut]

    class Config:
        orm_mode = True

253
backend/app/upload.py Normal file

@@ -0,0 +1,253 @@
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import joinedload
from typing import Optional
import logging
import re

from app.db import Publication, Author, Keyword, User, get_db
from app.utils import parser, nlp
from app.users import current_active_user
from app.file_converter import FileConverter, AdvancedDocxConverter

logger = logging.getLogger(__name__)
router = APIRouter()


@router.post("/upload/")
async def upload_publication(
    file: UploadFile = File(...),
    bibtex: Optional[UploadFile] = File(None),
    title: Optional[str] = Form(None),
    authors: Optional[str] = Form(None),
    year: Optional[int] = Form(None),
    journal: Optional[str] = Form(None),
    doi: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(current_active_user)
):
    try:
        bibtex_metadata = None
        if bibtex is not None:
            try:
                bibtex_content = (await bibtex.read()).decode("utf-8")
                b_title, b_authors, b_year, b_journal, b_doi = parser.bibtex(bibtex_content)
                bibtex_metadata = {
                    "title": b_title,
                    "authors": b_authors,
                    "year": b_year,
                    "journal": b_journal,
                    "doi": b_doi
                }
                # Form fields take precedence over BibTeX values
                title = title or b_title
                authors = authors or b_authors
                year = year or b_year
                journal = journal or b_journal
                doi = doi or b_doi
                logger.info(f"BibTeX processed. Metadata extracted: {bibtex_metadata}")
            except Exception as e:
                logger.error(f"BibTeX parsing error: {e}")
                raise HTTPException(
                    status_code=400,
                    detail=f"BibTeX parsing error: {str(e)}"
                )

        if doi and not is_valid_doi(doi):
            raise HTTPException(
                status_code=400,
                detail="Invalid DOI. Use the format: 10.xxxx/xxxxx"
            )

        if doi:
            existing_doi = await db.execute(
                select(Publication).where(Publication.doi == doi)
            )
            if existing_doi.scalar_one_or_none():
                raise HTTPException(
                    status_code=400,
                    detail="A publication with this DOI already exists"
                )

        if bibtex is None:
            missing_fields = []
            if not title: missing_fields.append("title")
            if not authors: missing_fields.append("authors")
            if not year: missing_fields.append("year")
            if not journal: missing_fields.append("journal")
            if missing_fields:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_fields)}. "
                           f"Fill in the fields or upload a BibTeX file."
                )
            logger.info("Manual mode")
        else:
            if not all([title, authors, year, journal]):
                missing_from_bibtex = []
                if not title: missing_from_bibtex.append("title")
                if not authors: missing_from_bibtex.append("authors")
                if not year: missing_from_bibtex.append("year")
                if not journal: missing_from_bibtex.append("journal")
                logger.error(f"Missing from BibTeX: {missing_from_bibtex}")
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_from_bibtex)}."
                )
            logger.info("BibTeX mode")

        if not file:
            raise HTTPException(status_code=400, detail="A file is required")

        allowed_extensions = ['.pdf', '.docx', '.tex', '.latex']
        file_extension = '.' + file.filename.split('.')[-1].lower() if '.' in file.filename else ''
        if file_extension not in allowed_extensions:
            logger.error(f"Extension not allowed: {file_extension}")
            raise HTTPException(
                status_code=400,
                detail=f"Extension not allowed; accepted extensions: {', '.join(allowed_extensions)}"
            )

        content = await file.read()
        logger.info(f"File uploaded: {file.filename} ({len(content)} bytes)")

        try:
            file_ext = FileConverter.get_file_extension(file.filename)
            conversion_method = "none"

            if file_ext == '.docx':
                try:
                    converted_content, final_filename = AdvancedDocxConverter.convert_docx_with_pandoc(
                        content, file.filename
                    )
                    conversion_method = "pandoc"
                    logger.info(f"DOCX converted with pandoc: {file.filename} -> {final_filename}")
                except Exception as pandoc_error:
                    logger.warning(f"Pandoc failed on DOCX: {pandoc_error}, using mammoth")
                    converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                        content, file.filename
                    )
                    conversion_method = "mammoth"
                    logger.info(f"DOCX converted with mammoth: {file.filename} -> {final_filename}")
            else:
                converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                    content, file.filename
                )
                conversion_method = "standard" if file_ext in ['.tex', '.latex'] else "none"
                logger.info(f"File processed: {file.filename} -> {final_filename}")
        except Exception as e:
            logger.error(f"Error while converting the file: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error while converting the file: {str(e)}"
            )

        try:
            text = parser.extract_text(file.filename, content)
            keywords = nlp.extract_keywords(text)
            logger.info(f"{len(keywords)} keywords extracted")
        except Exception as e:
            logger.warning(f"Error while extracting keywords: {e}")
            keywords = []

        author_names = [a.strip() for a in authors.split(",") if a.strip()]
        keyword_names = [k.strip().lower() for k in keywords if k.strip()]
        logger.info(f"Authors to process: {author_names}")
        logger.info(f"Keywords to process: {keyword_names}")

        # Reuse existing Author rows, creating any missing ones
        author_objs = []
        for name in author_names:
            result = await db.execute(select(Author).where(Author.name == name))
            author = result.scalar_one_or_none()
            if not author:
                author = Author(name=name)
                db.add(author)
                await db.flush()
                logger.info(f"New author created: {name}")
            else:
                logger.info(f"Existing author found: {name}")
            author_objs.append(author)

        # Reuse existing Keyword rows, creating any missing ones
        keyword_objs = []
        for kw in keyword_names:
            result = await db.execute(select(Keyword).where(Keyword.name == kw))
            keyword = result.scalar_one_or_none()
            if not keyword:
                keyword = Keyword(name=kw)
                db.add(keyword)
                await db.flush()
                logger.info(f"Keyword created: {kw}")
            else:
                logger.info(f"Existing keyword found: {kw}")
            keyword_objs.append(keyword)

        publication = Publication(
            title=title,
            file=converted_content,
            filename=final_filename,
            journal=journal,
            year=year,
            doi=doi,
            user_id=user.id,
            authors=author_objs,
            keywords=keyword_objs
        )
        db.add(publication)
        await db.commit()
        await db.refresh(publication)

        # Reload with relations for the response payload
        result = await db.execute(
            select(Publication)
            .options(joinedload(Publication.authors), joinedload(Publication.keywords))
            .where(Publication.id == publication.id)
        )
        publication_with_rel = result.unique().scalar_one()

        author_names_response = [author.name for author in publication_with_rel.authors]
        keyword_names_response = [kw.name for kw in publication_with_rel.keywords]

        response_data = {
            "id": publication_with_rel.id,
            "title": publication_with_rel.title,
            "authors": author_names_response,
            "keywords": keyword_names_response,
            "journal": publication_with_rel.journal,
            "year": publication_with_rel.year,
            "doi": publication_with_rel.doi,
            "original_filename": file.filename,
            "converted_filename": final_filename,
            "conversion_method": conversion_method
        }
        if bibtex is not None:
            response_data["metadata_source"] = "bibtex"
            response_data["bibtex_data"] = bibtex_metadata
            logger.info("Saved with BibTeX metadata")
        else:
            response_data["metadata_source"] = "manual"
            logger.info("Saved with manually entered metadata")

        return response_data
    except HTTPException:
        raise
    except Exception as e:
        await db.rollback()
        raise HTTPException(
            status_code=500,
            detail=f"Upload error: {str(e)}"
        )


def is_valid_doi(doi: str) -> bool:
    doi_pattern = r'^10\.\d{4,}/[-._;()/:\w\[\]]+$'
    return bool(re.match(doi_pattern, doi, re.IGNORECASE))

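A sketch of a multipart upload with a BibTeX sidecar (the token and file names are placeholders; the JWT comes from POST /auth/jwt/login):

import httpx

token = "YOUR_JWT"  # placeholder: obtain via POST /auth/jwt/login

files = {
    "file": ("paper.pdf", open("paper.pdf", "rb"), "application/pdf"),
    "bibtex": ("paper.bib", open("paper.bib", "rb"), "text/plain"),
}
r = httpx.post(
    "http://localhost:8000/upload/",
    files=files,
    headers={"Authorization": f"Bearer {token}"},
)
print(r.json()["metadata_source"])  # "bibtex" when the sidecar was parsed
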
56
backend/app/users.py Normal file

@@ -0,0 +1,56 @@
import uuid
from typing import Optional

from fastapi import Depends, Request
from fastapi_users import BaseUserManager, FastAPIUsers, UUIDIDMixin, models
from fastapi_users.authentication import (
    AuthenticationBackend,
    BearerTransport,
    JWTStrategy,
)
from fastapi_users.db import SQLAlchemyUserDatabase

from app.db import User, get_user_db

# CHANGE ME: load this from the environment in production; a secret hardcoded
# in the repository compromises every issued token.
SECRET = "1d90d4315c0a0313fb65211fa82e88129cddedb8b662553fbd38f44be9dc818bbd8623ca0177d965e762ee9727b5f6a2bd98481311ecccbcae846bff4f57b8ce72a51fca3278caa05ff18e54c563788d2a67b44be6fc667c12d1b6c2d869f6637b67025a6aa938e811616f27c160a13dc7b653e56a9823f61a165cdf671f734c"


class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    reset_password_token_secret = SECRET
    verification_token_secret = SECRET

    async def on_after_register(self, user: User, request: Optional[Request] = None):
        print(f"User {user.id} has registered.")

    async def on_after_forgot_password(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"User {user.id} has forgotten their password. Reset token: {token}")

    async def on_after_request_verify(
        self, user: User, token: str, request: Optional[Request] = None
    ):
        print(f"Verification requested for user {user.id}. Verification token: {token}")


async def get_user_manager(user_db: SQLAlchemyUserDatabase = Depends(get_user_db)):
    yield UserManager(user_db)


bearer_transport = BearerTransport(tokenUrl="auth/jwt/login")


def get_jwt_strategy() -> JWTStrategy[models.UP, models.ID]:
    return JWTStrategy(secret=SECRET, lifetime_seconds=3600)


auth_backend = AuthenticationBackend(
    name="jwt",
    transport=bearer_transport,
    get_strategy=get_jwt_strategy,
)

fastapi_users = FastAPIUsers[User, uuid.UUID](get_user_manager, [auth_backend])
current_active_user = fastapi_users.current_user(active=True)

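One way to generate a replacement for the hardcoded value above:

# Prints a long random hex secret suitable for JWTStrategy; keep it out of the repo.
import secrets
print(secrets.token_hex(64))
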
6
backend/app/utils/nlp.py Normal file

@@ -0,0 +1,6 @@
import yake


def extract_keywords(text: str, num_keywords: int = 5) -> list:
    # YAKE returns (keyword, score) pairs, with lower scores being better;
    # only the keyword strings are kept.
    kw_extractor = yake.KeywordExtractor(lan="en", n=1, top=num_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, _ in keywords]

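A quick check of the extractor (the exact terms depend on YAKE's statistical scoring):

from app.utils.nlp import extract_keywords

text = "Graph neural networks learn representations of molecules for drug discovery."
print(extract_keywords(text, num_keywords=3))
# e.g. ['molecules', 'discovery', 'Graph'] -- output varies with YAKE's scoring
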
165
backend/app/utils/parser.py Normal file

@@ -0,0 +1,165 @@
import bibtexparser
import io
import logging
import os
import tempfile
from typing import Tuple, Optional

from pdfminer.high_level import extract_text as pdf_extract_text

logger = logging.getLogger(__name__)


def bibtex(bibtex_content: str) -> Tuple[Optional[str], Optional[str], Optional[int], Optional[str], Optional[str]]:
    """
    Extracts title, authors, year, journal and doi from the first record of a BibTeX file.
    Returns a tuple (title, authors, year, journal, doi).
    """
    bib_database = bibtexparser.load(io.StringIO(bibtex_content))
    if not bib_database.entries:
        return (None, None, None, None, None)
    entry = bib_database.entries[0]
    title = entry.get('title')
    # The standard BibTeX field is 'author'; fall back to 'authors' for lenient input
    authors = entry.get('author') or entry.get('authors')
    year = int(entry['year']) if 'year' in entry else None
    journal = entry.get('journal')
    doi = entry.get('doi')
    return (title, authors, year, journal, doi)


def extract_text(filename: str, content: bytes) -> str:
    """
    Extracts text from an uploaded file for keyword analysis.

    Args:
        filename: File name (used to determine the type)
        content: File content in bytes

    Returns:
        str: Text extracted from the document
    """
    try:
        # Determine the file extension
        file_extension = os.path.splitext(filename.lower())[1]
        if file_extension == '.pdf':
            return extract_text_from_pdf(content)
        elif file_extension == '.docx':
            return extract_text_from_docx(content)
        elif file_extension in ['.tex', '.latex']:
            return extract_text_from_latex(content)
        else:
            logger.warning(f"File type not supported for text extraction: {file_extension}")
            return ""
    except Exception as e:
        logger.error(f"Error extracting text from {filename}: {e}")
        return ""


def extract_text_from_pdf(pdf_content: bytes) -> str:
    """Extracts text from PDF content using pdfminer."""
    try:
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
            temp_file.write(pdf_content)
            temp_file.flush()
            # Extract the text with pdfminer
            text = pdf_extract_text(temp_file.name)
        # Clean up the temporary file
        os.unlink(temp_file.name)
        logger.info(f"Extracted PDF text: {len(text)} characters")
        return text or ""
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return ""


def extract_text_from_docx(docx_content: bytes) -> str:
    """Extracts text from DOCX content using python-docx."""
    try:
        from docx import Document

        with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as temp_file:
            temp_file.write(docx_content)
            temp_file.flush()
            # Extract the text with python-docx
            doc = Document(temp_file.name)
            text_parts = []
            for paragraph in doc.paragraphs:
                text_parts.append(paragraph.text)
            text = '\n'.join(text_parts)
        # Clean up the temporary file
        os.unlink(temp_file.name)
        logger.info(f"Extracted DOCX text: {len(text)} characters")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""


def extract_text_from_latex(latex_content: bytes) -> str:
    """Extracts text from LaTeX content by stripping LaTeX commands."""
    try:
        from pylatexenc.latex2text import LatexNodes2Text

        # Decode the content
        latex_text = latex_content.decode('utf-8', errors='ignore')
        # Convert LaTeX to plain text
        converter = LatexNodes2Text()
        text = converter.latex_to_text(latex_text)
        logger.info(f"Extracted LaTeX text: {len(text)} characters")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from LaTeX: {e}")
        # Fallback: strip the most common LaTeX commands by hand
        try:
            import re

            latex_text = latex_content.decode('utf-8', errors='ignore')
            text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', latex_text)
            text = re.sub(r'\\[a-zA-Z]+', '', text)
            text = re.sub(r'\{[^}]*\}', '', text)
            text = re.sub(r'%.*', '', text)  # strip comments
            return text.strip()
        except Exception:
            return ""
def clean_extracted_text(text: str) -> str:
    """Cleans the extracted text to improve keyword extraction."""
    import re

    # Drop very short lines first (likely headers/footers); collapsing all
    # whitespace beforehand would destroy the line structure this filter needs
    lines = text.split('\n')
    clean_lines = [line.strip() for line in lines if len(line.strip()) > 10]
    text = '\n'.join(clean_lines)
    # Collapse repeated spaces and tabs within lines
    text = re.sub(r'[ \t]+', ' ', text)
    # Strip excessive special characters
    text = re.sub(r'[^\w\s\-.,;:()[\]{}]', ' ', text)
    return text.strip()
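A sketch of the BibTeX helper on a minimal single-entry string:

from app.utils import parser

entry = """@article{doe2024,
  title   = {An Example},
  author  = {Doe, Jane and Roe, Richard},
  journal = {Journal of Examples},
  year    = {2024},
  doi     = {10.1234/example.2024}
}"""
print(parser.bibtex(entry))
# ('An Example', 'Doe, Jane and Roe, Richard', 2024, 'Journal of Examples', '10.1234/example.2024')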