mirror of
https://github.com/fccapria/scientify.git
synced 2026-01-12 02:36:10 +00:00
Initial release
This commit is contained in:
commit
ae5e4b8873
52 changed files with 17572 additions and 0 deletions
253
backend/app/upload.py
Normal file
@@ -0,0 +1,253 @@
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import joinedload

from app.db import Publication, Author, Keyword, User, get_db
from app.utils import parser, nlp
from app.users import current_active_user
from app.file_converter import FileConverter, AdvancedDocxConverter

from typing import Optional
import logging

logger = logging.getLogger(__name__)

router = APIRouter()

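# Accepts a publication file (PDF, DOCX, or LaTeX) plus metadata supplied
# either as explicit form fields or via an optional BibTeX file.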
@router.post("/upload/")
async def upload_publication(
    file: UploadFile = File(...),
    bibtex: Optional[UploadFile] = File(None),
    title: Optional[str] = Form(None),
    authors: Optional[str] = Form(None),
    year: Optional[int] = Form(None),
    journal: Optional[str] = Form(None),
    doi: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(current_active_user)
):
    try:
        bibtex_metadata = None

        if bibtex is not None:
            try:
                bibtex_content = (await bibtex.read()).decode("utf-8")
                b_title, b_authors, b_year, b_journal, b_doi = parser.bibtex(bibtex_content)
                bibtex_metadata = {
                    "title": b_title,
                    "authors": b_authors,
                    "year": b_year,
                    "journal": b_journal,
                    "doi": b_doi
                }

                # Explicit form fields take precedence over BibTeX values.
                title = title or b_title
                authors = authors or b_authors
                year = year or b_year
                journal = journal or b_journal
                doi = doi or b_doi

                logger.info(f"BibTeX processed. Metadata extracted: {bibtex_metadata}")

            except Exception as e:
                logger.error(f"BibTeX parsing error: {e}")
                raise HTTPException(
                    status_code=400,
                    detail=f"BibTeX parsing error: {str(e)}"
                )

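        # DOI checks: syntactic validity first, then uniqueness in the database.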
        if doi and not is_valid_doi(doi):
            raise HTTPException(
                status_code=400,
                detail="Invalid DOI. Expected format: 10.xxxx/xxxxx"
            )

        if doi:
            existing_doi = await db.execute(
                select(Publication).where(Publication.doi == doi)
            )
            if existing_doi.scalar_one_or_none():
                raise HTTPException(
                    status_code=400,
                    detail="A publication with this DOI already exists"
                )
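
        # Completeness check: in manual mode every field must come from the form;
        # in BibTeX mode the merged form/BibTeX values must cover all fields.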
        if bibtex is None:
            missing_fields = []
            if not title: missing_fields.append("title")
            if not authors: missing_fields.append("authors")
            if not year: missing_fields.append("year")
            if not journal: missing_fields.append("journal")

            if missing_fields:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_fields)}. "
                           f"Fill in the form fields or upload a BibTeX file."
                )
            logger.info("Manual mode")
        else:
            if not all([title, authors, year, journal]):
                missing_from_bibtex = []
                if not title: missing_from_bibtex.append("title")
                if not authors: missing_from_bibtex.append("authors")
                if not year: missing_from_bibtex.append("year")
                if not journal: missing_from_bibtex.append("journal")

                logger.error(f"Missing from BibTeX: {missing_from_bibtex}")
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_from_bibtex)}."
                )
            logger.info("BibTeX mode")

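        # File checks: a file is mandatory and its extension must be supported.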
        if not file:
            raise HTTPException(status_code=400, detail="A file is required")

        allowed_extensions = ['.pdf', '.docx', '.tex', '.latex']
        file_extension = '.' + file.filename.split('.')[-1].lower() if '.' in file.filename else ''
        if file_extension not in allowed_extensions:
            logger.error(f"Extension not allowed: {file_extension}")
            raise HTTPException(
                status_code=400,
                detail=f"Extension not allowed. Supported extensions: {', '.join(allowed_extensions)}"
            )

        content = await file.read()
        logger.info(f"File uploaded: {file.filename} ({len(content)} bytes)")

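        # Conversion: DOCX goes through pandoc first, with mammoth as a fallback;
        # LaTeX sources use the standard converter; PDFs pass through unchanged.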
        try:
            file_ext = FileConverter.get_file_extension(file.filename)
            conversion_method = "none"

            if file_ext == '.docx':
                try:
                    converted_content, final_filename = AdvancedDocxConverter.convert_docx_with_pandoc(
                        content, file.filename
                    )
                    conversion_method = "pandoc"
                    logger.info(f"DOCX converted with pandoc: {file.filename} -> {final_filename}")
                except Exception as pandoc_error:
                    logger.warning(f"Pandoc failed on DOCX: {pandoc_error}, falling back to mammoth")
                    converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                        content, file.filename
                    )
                    conversion_method = "mammoth"
                    logger.info(f"DOCX converted with mammoth: {file.filename} -> {final_filename}")
            else:
                converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                    content, file.filename
                )
                conversion_method = "standard" if file_ext in ['.tex', '.latex'] else "none"
                logger.info(f"File processed: {file.filename} -> {final_filename}")

        except Exception as e:
            logger.error(f"Error while converting the file: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error while converting the file: {str(e)}"
            )

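        # Keyword extraction is best-effort: a failure is logged, not fatal.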
        try:
            text = parser.extract_text(file.filename, content)
            keywords = nlp.extract_keywords(text)
            logger.info(f"{len(keywords)} keywords extracted")
        except Exception as e:
            logger.warning(f"Error while extracting keywords: {e}")
            keywords = []

        author_names = [a.strip() for a in authors.split(",") if a.strip()]
        keyword_names = [k.strip().lower() for k in keywords if k.strip()]

        logger.info(f"Authors to process: {author_names}")
        logger.info(f"Keywords to process: {keyword_names}")

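        # Get-or-create: reuse existing Author/Keyword rows, create missing ones.
        # flush() assigns primary keys without committing the transaction yet.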
        author_objs = []
        for name in author_names:
            result = await db.execute(select(Author).where(Author.name == name))
            author = result.scalar_one_or_none()
            if not author:
                author = Author(name=name)
                db.add(author)
                await db.flush()
                logger.info(f"New author created: {name}")
            else:
                logger.info(f"Existing author found: {name}")
            author_objs.append(author)

        keyword_objs = []
        for kw in keyword_names:
            result = await db.execute(select(Keyword).where(Keyword.name == kw))
            keyword = result.scalar_one_or_none()
            if not keyword:
                keyword = Keyword(name=kw)
                db.add(keyword)
                await db.flush()
                logger.info(f"Keyword created: {kw}")
            else:
                logger.info(f"Existing keyword found: {kw}")
            keyword_objs.append(keyword)

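        # Persist the publication together with its author/keyword relationships.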
        publication = Publication(
            title=title,
            file=converted_content,
            filename=final_filename,
            journal=journal,
            year=year,
            doi=doi,
            user_id=user.id,
            authors=author_objs,
            keywords=keyword_objs
        )
        db.add(publication)
        await db.commit()
        await db.refresh(publication)

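        # Re-select with joinedload so the relationships are loaded eagerly;
        # lazy loading after commit would fail under the async session.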
        result = await db.execute(
            select(Publication)
            .options(joinedload(Publication.authors), joinedload(Publication.keywords))
            .where(Publication.id == publication.id)
        )
        publication_with_rel = result.unique().scalar_one()

        author_names_response = [author.name for author in publication_with_rel.authors]
        keyword_names_response = [kw.name for kw in publication_with_rel.keywords]

        response_data = {
            "id": publication_with_rel.id,
            "title": publication_with_rel.title,
            "authors": author_names_response,
            "keywords": keyword_names_response,
            "journal": publication_with_rel.journal,
            "year": publication_with_rel.year,
            "doi": publication_with_rel.doi,
            "original_filename": file.filename,
            "converted_filename": final_filename,
            "conversion_method": conversion_method
        }

        if bibtex is not None:
            response_data["metadata_source"] = "bibtex"
            response_data["bibtex_data"] = bibtex_metadata
            logger.info("Saved with BibTeX metadata")
        else:
            response_data["metadata_source"] = "manual"
            logger.info("Saved with manual metadata")

        return response_data

    except HTTPException:
        raise
    except Exception as e:
        await db.rollback()
        logger.error(f"Upload error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Upload error: {str(e)}"
        )


def is_valid_doi(doi: str) -> bool:
    # Loose syntactic check for DOIs of the form "10.<registrant>/<suffix>".
    import re
    doi_pattern = r'^10\.\d{4,}/[-._;()/:\w\[\]]+$'
    return bool(re.match(doi_pattern, doi, re.IGNORECASE))
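
# Quick sanity examples for is_valid_doi (illustrative DOIs, not from the commit):
#   is_valid_doi("10.1000/xyz123")        -> True
#   is_valid_doi("10.1234/j.jmb.2020.01") -> True
#   is_valid_doi("doi:10.1000/xyz123")    -> False (prefix not matched)
#   is_valid_doi("11.1000/xyz123")        -> False (must start with "10.")
#
# A minimal client sketch for the endpoint (not part of this file; assumes the
# router is mounted at the application root and the request is authenticated):
#
#   import httpx
#   with open("paper.pdf", "rb") as f:
#       r = httpx.post(
#           "http://localhost:8000/upload/",
#           files={"file": ("paper.pdf", f, "application/pdf")},
#           data={"title": "A Title", "authors": "Ada Lovelace, Alan Turing",
#                 "year": "2020", "journal": "Example Journal"},
#       )
#   print(r.status_code, r.json())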