Initial release

This commit is contained in:
Francesco Carmelo Capria 2025-06-21 18:15:33 +02:00
commit ae5e4b8873
52 changed files with 17572 additions and 0 deletions

253
backend/app/upload.py Normal file
View file

@ -0,0 +1,253 @@
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import joinedload
from app.db import Publication, Author, Keyword, User, get_db
from app.utils import parser, nlp
from app.users import current_active_user
from app.file_converter import FileConverter, AdvancedDocxConverter
from typing import Optional
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post("/upload/")
async def upload_publication(
file: UploadFile = File(...),
bibtex: Optional[UploadFile] = File(None),
title: Optional[str] = Form(None),
authors: Optional[str] = Form(None),
year: Optional[int] = Form(None),
journal: Optional[str] = Form(None),
doi: Optional[str] = Form(None),
db: AsyncSession = Depends(get_db),
user: User = Depends(current_active_user)
):
try:
bibtex_metadata = None
if bibtex is not None:
try:
bibtex_content = (await bibtex.read()).decode("utf-8")
b_title, b_authors, b_year, b_journal, b_doi = parser.bibtex(bibtex_content)
bibtex_metadata = {
"title": b_title,
"authors": b_authors,
"year": b_year,
"journal": b_journal,
"doi": b_doi
}
title = title or b_title
authors = authors or b_authors
year = year or b_year
journal = journal or b_journal
doi = doi or b_doi
logger.info(f"BibTeX processed. Metadatas extracted: {bibtex_metadata}")
except Exception as e:
logger.error(f"Parsing BibTeX error: {e}")
raise HTTPException(
status_code=400,
detail=f"Parsing BibTeX error: {str(e)}"
)
if doi and not is_valid_doi(doi):
raise HTTPException(
status_code=400,
detail="DOI invalid. Use this format: 10.xxxx/xxxxx"
)
if doi:
existing_doi = await db.execute(
select(Publication).where(Publication.doi == doi)
)
if existing_doi.scalar_one_or_none():
raise HTTPException(
status_code=400,
detail="DOI existing"
)
if bibtex is None:
missing_fields = []
if not title: missing_fields.append("title")
if not authors: missing_fields.append("authors")
if not year: missing_fields.append("year")
if not journal: missing_fields.append("journal")
if missing_fields:
raise HTTPException(
status_code=400,
detail=f"Missing fields: {', '.join(missing_fields)}. "
f"Insert fields or upload a BibTeX."
)
logger.info("Manual mode")
else:
if not all([title, authors, year, journal]):
missing_from_bibtex = []
if not title: missing_from_bibtex.append("title")
if not authors: missing_from_bibtex.append("authors")
if not year: missing_from_bibtex.append("year")
if not journal: missing_from_bibtex.append("journal")
logger.error(f"Missing from BibTeX: {missing_from_bibtex}")
raise HTTPException(
status_code=400,
detail=f"Missing fields: {', '.join(missing_from_bibtex)}. "
)
logger.info("BibTeX mode")
if not file:
raise HTTPException(status_code=400, detail="File needed")
allowed_extensions = ['.pdf', '.docx', '.tex', '.latex']
file_extension = '.' + file.filename.split('.')[-1].lower() if '.' in file.filename else ''
if file_extension not in allowed_extensions:
logger.error(f"Extension not allowed: {file_extension}")
raise HTTPException(
status_code=400,
detail=f"Extension not allowed, please upload these: {', '.join(allowed_extensions)}"
)
content = await file.read()
logger.info(f"File uploaded: {file.filename} ({len(content)} bytes)")
try:
file_ext = FileConverter.get_file_extension(file.filename)
conversion_method = "none"
if file_ext == '.docx':
try:
converted_content, final_filename = AdvancedDocxConverter.convert_docx_with_pandoc(
content, file.filename
)
conversion_method = "pandoc"
logger.info(f"DOCX converted pandoc: {file.filename} -> {final_filename}")
except Exception as pandoc_error:
logger.warning(f"Pandoc failed with DOCX: {pandoc_error}, use mammoth")
converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
content, file.filename
)
conversion_method = "mammoth"
logger.info(f"DOCX converted with mammoth: {file.filename} -> {final_filename}")
else:
converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
content, file.filename
)
conversion_method = "standard" if file_ext in ['.tex', '.latex'] else "none"
logger.info(f"File processed: {file.filename} -> {final_filename}")
except Exception as e:
logger.error(f"Error while converting the file: {e}")
raise HTTPException(
status_code=500,
detail=f"Error while converting the file: {str(e)}"
)
try:
text = parser.extract_text(file.filename, content)
keywords = nlp.extract_keywords(text)
logger.info(f"{len(keywords)} keywords extracted")
except Exception as e:
logger.warning(f"Error while extracting keywords: {e}")
keywords = []
author_names = [a.strip() for a in authors.split(",") if a.strip()]
keyword_names = [k.strip().lower() for k in keywords if k.strip()]
logger.info(f"Authors to process: {author_names}")
logger.info(f"Keywords to process: {keyword_names}")
author_objs = []
for name in author_names:
result = await db.execute(select(Author).where(Author.name == name))
author = result.scalar_one_or_none()
if not author:
author = Author(name=name)
db.add(author)
await db.flush()
logger.info(f"New author created: {name}")
else:
logger.info(f"Existing author found: {name}")
author_objs.append(author)
keyword_objs = []
for kw in keyword_names:
result = await db.execute(select(Keyword).where(Keyword.name == kw))
keyword = result.scalar_one_or_none()
if not keyword:
keyword = Keyword(name=kw)
db.add(keyword)
await db.flush()
logger.info(f"Keyword created: {kw}")
else:
logger.info(f"Existing keyword found: {kw}")
keyword_objs.append(keyword)
publication = Publication(
title=title,
file=converted_content,
filename=final_filename,
journal=journal,
year=year,
doi=doi,
user_id=user.id,
authors=author_objs,
keywords=keyword_objs
)
db.add(publication)
await db.commit()
await db.refresh(publication)
result = await db.execute(
select(Publication)
.options(joinedload(Publication.authors), joinedload(Publication.keywords))
.where(Publication.id == publication.id)
)
publication_with_rel = result.unique().scalar_one()
author_names_response = [author.name for author in publication_with_rel.authors]
keyword_names_response = [kw.name for kw in publication_with_rel.keywords]
response_data = {
"id": publication_with_rel.id,
"title": publication_with_rel.title,
"authors": author_names_response,
"keywords": keyword_names_response,
"journal": publication_with_rel.journal,
"year": publication_with_rel.year,
"doi": publication_with_rel.doi,
"original_filename": file.filename,
"converted_filename": final_filename,
"conversion_method": conversion_method
}
if bibtex is not None:
response_data["metadata_source"] = "bibtex"
response_data["bibtex_data"] = bibtex_metadata
logger.info("Saved with BibTeX metadata")
else:
response_data["metadata_source"] = "manual"
logger.info("Saved with classical metadata")
return response_data
except HTTPException:
raise
except Exception as e:
await db.rollback()
raise HTTPException(
status_code=500,
detail=f"Upload error: {str(e)}"
)
def is_valid_doi(doi: str) -> bool:
import re
doi_pattern = r'^10\.\d{4,}/[-._;()/:\w\[\]]+$'
return bool(re.match(doi_pattern, doi, re.IGNORECASE))