# scientify/backend/app/upload.py
import logging
import re
from typing import Optional

from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import joinedload

from app.db import Publication, Author, Keyword, User, get_db
from app.utils import parser, nlp
from app.users import current_active_user
from app.file_converter import FileConverter, AdvancedDocxConverter
# Module-level logger and the router the application mounts for this module.
logger = logging.getLogger(__name__)
router = APIRouter()
@router.post("/upload/")
async def upload_publication(
    file: UploadFile = File(...),
    bibtex: Optional[UploadFile] = File(None),
    title: Optional[str] = Form(None),
    authors: Optional[str] = Form(None),
    year: Optional[int] = Form(None),
    journal: Optional[str] = Form(None),
    doi: Optional[str] = Form(None),
    db: AsyncSession = Depends(get_db),
    user: User = Depends(current_active_user)
) -> dict:
    """Upload a publication file together with its metadata.

    Metadata comes either from an optional BibTeX sidecar file or from
    the form fields ("manual mode"); when both are present, non-empty
    form fields take precedence over BibTeX values.  The document
    (.pdf, .docx, .tex, .latex) is run through the project's file
    converters (DOCX via pandoc with a mammoth fallback), keywords are
    extracted from its text, and the publication is persisted together
    with its authors and keywords, both deduplicated by name.

    Returns:
        dict with the stored publication's fields, filenames, the
        conversion method used, and the metadata source.

    Raises:
        HTTPException 400: unparsable BibTeX, invalid or duplicate DOI,
            missing metadata fields, or unsupported file extension.
        HTTPException 500: file-conversion failure or any unexpected
            error (the DB transaction is rolled back in the latter case).
    """
    try:
        bibtex_metadata = None
        if bibtex is not None:
            # BibTeX mode: parse the sidecar file; explicit form fields win.
            try:
                bibtex_content = (await bibtex.read()).decode("utf-8")
                b_title, b_authors, b_year, b_journal, b_doi = parser.bibtex(bibtex_content)
                bibtex_metadata = {
                    "title": b_title,
                    "authors": b_authors,
                    "year": b_year,
                    "journal": b_journal,
                    "doi": b_doi
                }
                # Form fields take precedence; BibTeX only fills the gaps.
                title = title or b_title
                authors = authors or b_authors
                year = year or b_year
                journal = journal or b_journal
                doi = doi or b_doi
                logger.info(f"BibTeX processed. Metadatas extracted: {bibtex_metadata}")
            except Exception as e:
                logger.error(f"Parsing BibTeX error: {e}")
                raise HTTPException(
                    status_code=400,
                    detail=f"Parsing BibTeX error: {str(e)}"
                )
        if doi and not is_valid_doi(doi):
            raise HTTPException(
                status_code=400,
                detail="DOI invalid. Use this format: 10.xxxx/xxxxx"
            )
        if doi:
            # DOIs must be unique across stored publications.
            existing_doi = await db.execute(
                select(Publication).where(Publication.doi == doi)
            )
            if existing_doi.scalar_one_or_none():
                raise HTTPException(
                    status_code=400,
                    detail="DOI existing"
                )
        if bibtex is None:
            # Manual mode: every core metadata field must come from the form.
            missing_fields = []
            if not title: missing_fields.append("title")
            if not authors: missing_fields.append("authors")
            if not year: missing_fields.append("year")
            if not journal: missing_fields.append("journal")
            if missing_fields:
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_fields)}. "
                    f"Insert fields or upload a BibTeX."
                )
            logger.info("Manual mode")
        else:
            # BibTeX mode: fields can still be missing after the merge above.
            if not all([title, authors, year, journal]):
                missing_from_bibtex = []
                if not title: missing_from_bibtex.append("title")
                if not authors: missing_from_bibtex.append("authors")
                if not year: missing_from_bibtex.append("year")
                if not journal: missing_from_bibtex.append("journal")
                logger.error(f"Missing from BibTeX: {missing_from_bibtex}")
                raise HTTPException(
                    status_code=400,
                    detail=f"Missing fields: {', '.join(missing_from_bibtex)}. "
                )
            logger.info("BibTeX mode")
        if not file:
            raise HTTPException(status_code=400, detail="File needed")
        allowed_extensions = ['.pdf', '.docx', '.tex', '.latex']
        # NOTE(review): extension is derived here by string split but again
        # below via FileConverter.get_file_extension — presumably equivalent;
        # confirm they agree for multi-dot filenames.
        file_extension = '.' + file.filename.split('.')[-1].lower() if '.' in file.filename else ''
        if file_extension not in allowed_extensions:
            logger.error(f"Extension not allowed: {file_extension}")
            raise HTTPException(
                status_code=400,
                detail=f"Extension not allowed, please upload these: {', '.join(allowed_extensions)}"
            )
        content = await file.read()
        logger.info(f"File uploaded: {file.filename} ({len(content)} bytes)")
        try:
            file_ext = FileConverter.get_file_extension(file.filename)
            conversion_method = "none"
            if file_ext == '.docx':
                # Prefer pandoc for DOCX; fall back to mammoth on any failure.
                try:
                    converted_content, final_filename = AdvancedDocxConverter.convert_docx_with_pandoc(
                        content, file.filename
                    )
                    conversion_method = "pandoc"
                    logger.info(f"DOCX converted pandoc: {file.filename} -> {final_filename}")
                except Exception as pandoc_error:
                    logger.warning(f"Pandoc failed with DOCX: {pandoc_error}, use mammoth")
                    converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                        content, file.filename
                    )
                    conversion_method = "mammoth"
                    logger.info(f"DOCX converted with mammoth: {file.filename} -> {final_filename}")
            else:
                converted_content, final_filename = FileConverter.convert_to_pdf_if_needed(
                    content, file.filename
                )
                conversion_method = "standard" if file_ext in ['.tex', '.latex'] else "none"
                logger.info(f"File processed: {file.filename} -> {final_filename}")
        except Exception as e:
            logger.error(f"Error while converting the file: {e}")
            raise HTTPException(
                status_code=500,
                detail=f"Error while converting the file: {str(e)}"
            )
        try:
            # Keyword extraction runs on the ORIGINAL uploaded bytes, not the
            # converted output; failures here are non-fatal (empty keywords).
            text = parser.extract_text(file.filename, content)
            keywords = nlp.extract_keywords(text)
            logger.info(f"{len(keywords)} keywords extracted")
        except Exception as e:
            logger.warning(f"Error while extracting keywords: {e}")
            keywords = []
        # Authors arrive as a single comma-separated string; keywords are
        # normalized to lowercase before deduplication.
        author_names = [a.strip() for a in authors.split(",") if a.strip()]
        keyword_names = [k.strip().lower() for k in keywords if k.strip()]
        logger.info(f"Authors to process: {author_names}")
        logger.info(f"Keywords to process: {keyword_names}")
        # Get-or-create each Author by name; flush so new rows obtain ids
        # before the Publication references them.
        author_objs = []
        for name in author_names:
            result = await db.execute(select(Author).where(Author.name == name))
            author = result.scalar_one_or_none()
            if not author:
                author = Author(name=name)
                db.add(author)
                await db.flush()
                logger.info(f"New author created: {name}")
            else:
                logger.info(f"Existing author found: {name}")
            author_objs.append(author)
        # Same get-or-create pattern for keywords.
        keyword_objs = []
        for kw in keyword_names:
            result = await db.execute(select(Keyword).where(Keyword.name == kw))
            keyword = result.scalar_one_or_none()
            if not keyword:
                keyword = Keyword(name=kw)
                db.add(keyword)
                await db.flush()
                logger.info(f"Keyword created: {kw}")
            else:
                logger.info(f"Existing keyword found: {kw}")
            keyword_objs.append(keyword)
        publication = Publication(
            title=title,
            file=converted_content,
            filename=final_filename,
            journal=journal,
            year=year,
            doi=doi,
            user_id=user.id,
            authors=author_objs,
            keywords=keyword_objs
        )
        db.add(publication)
        await db.commit()
        await db.refresh(publication)
        # Re-select with eagerly loaded relations so the response can be
        # built without lazy loads on the async session after commit.
        result = await db.execute(
            select(Publication)
            .options(joinedload(Publication.authors), joinedload(Publication.keywords))
            .where(Publication.id == publication.id)
        )
        publication_with_rel = result.unique().scalar_one()
        author_names_response = [author.name for author in publication_with_rel.authors]
        keyword_names_response = [kw.name for kw in publication_with_rel.keywords]
        response_data = {
            "id": publication_with_rel.id,
            "title": publication_with_rel.title,
            "authors": author_names_response,
            "keywords": keyword_names_response,
            "journal": publication_with_rel.journal,
            "year": publication_with_rel.year,
            "doi": publication_with_rel.doi,
            "original_filename": file.filename,
            "converted_filename": final_filename,
            "conversion_method": conversion_method
        }
        if bibtex is not None:
            response_data["metadata_source"] = "bibtex"
            response_data["bibtex_data"] = bibtex_metadata
            logger.info("Saved with BibTeX metadata")
        else:
            response_data["metadata_source"] = "manual"
            logger.info("Saved with classical metadata")
        return response_data
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        # Unexpected failure: undo any pending DB work before reporting.
        await db.rollback()
        raise HTTPException(
            status_code=500,
            detail=f"Upload error: {str(e)}"
        )
# Compiled once at import time: "10.<4+ digit registrant>/<suffix>".
_DOI_PATTERN = re.compile(r'10\.\d{4,}/[-._;()/:\w\[\]]+', re.IGNORECASE)


def is_valid_doi(doi: str) -> bool:
    """Return True if *doi* is a syntactically valid DOI.

    Accepts the "10.<registrant>/<suffix>" shape, e.g. "10.1234/abc-1".
    Uses re.fullmatch so the whole string must match: the previous
    re.match(..., r'...$') form accepted a trailing newline, which
    fullmatch correctly rejects.  The pattern is precompiled at module
    level instead of re-importing and recompiling on every call.
    """
    return _DOI_PATTERN.fullmatch(doi) is not None