Source code for polonaexplorer.plotter

"""Generates topic model maps for text containing target words."""
#!/usr/bin/env python
# coding: utf-8

import json
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
from sentence_transformers import SentenceTransformer
from torch import cuda

# Console logging for the whole process: INFO and above, each record
# prefixed with timestamp, logger name and level.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

logging.basicConfig(
    format=_LOG_FORMAT,
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Endpoint of the Ollama text-generation API that is queried for topic labels.
OLLAMA_URL = "http://localhost:11434/api/generate"

# Instruction preamble that is placed in front of every labelling prompt.
SYSTEM_PROMPT = """
        <s>[INST] <<SYS>>
        You are a helpful, respectful and honest assistant for labeling topics.
        <</SYS>>
        """

# One worked example (documents + keywords -> label) shown to the LLM before
# the real topic, so that it answers with a bare label.
EXAMPLE_PROMPT = """
        I have a topic that contains the following documents:
        - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
        - Meat, but especially beef, is the worst food in terms of emissions.
        - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

        The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

        Based on the information about the topic above, please create a short English label of this topic. Make sure you to only return the label and nothing more.

        [/INST] Environmental impacts of eating meat
        """

# HTML shown when hovering over a point of the interactive map. The
# ``{...}`` placeholders are left unformatted here; they name columns of the
# extra point data handed to the plot.
HOVER_TEXT_TEMPLATE = """
<div style="background-color:#eeeeeeff;">
    <p style="color:#000000;">Document: {id}, page {page}<br/>Date: {isodate}</p>
    <p style="color:#ff0000;">Topic: {label}</p>
    <p style="color:#000000;">Text:<br/> {text}</p>
</div>
"""



[docs]
class Plotter:
    """Run topic model and plotting.

    The full workflow is available through :meth:`plot`: embed the
    documents, configure and fit a BERTopic model, ask an Ollama-served
    LLM for readable topic names and write an interactive datamap as HTML.

    The input is a JSON-lines file. The code reads the columns ``year``,
    ``date``, ``text``, ``id``, ``page`` and ``academica_id`` from it.
    """

    def __init__(
        self,
        data_path: str,
        out_path: str,
        year_range: tuple[int, int],
        embedding_model: str = "google/embeddinggemma-300m",
        topicname_llm_model: str = "llama4:scout",
    ) -> None:
        """Load the documents of one time slice and record the run parameters.

        Args:
            data_path: JSON-lines file with the documents.
            out_path: Directory receiving every artefact (embeddings, topic
                names, model, HTML map, ``parameters.json``). It is created
                if it does not exist.
            year_range: ``(start, end)``; rows with ``start <= year < end``
                are kept.
            embedding_model: Name of the sentence-transformers model.
            topicname_llm_model: Name of the Ollama model used for labels.

        Raises:
            ValueError: If no document falls into ``year_range``.
        """
        self.out_path = Path(out_path)
        self.embedding_model = embedding_model
        self.topicLLM = topicname_llm_model
        self.year_range = year_range

        self.data = (
            pd.read_json(data_path, lines=True)
            .query(f"{year_range[0]} <= year < {year_range[1]}")
            .reset_index(drop=True)
        )
        # Fail before deriving columns or touching the output directory.
        if self.data.empty:
            raise ValueError("Dataset is empty for chosen time range.")

        # Make sure the dates are real timestamps, whatever the reader
        # inferred, before calling ``isoformat`` on them.
        dates = pd.to_datetime(self.data["date"])
        self.data["date"] = dates
        self.data.insert(0, "isodate", dates.apply(lambda stamp: stamp.isoformat()))
        self.text = self.data["text"].to_numpy()

        # Merge this run's settings into parameters.json. A missing file is
        # not an error: the run then simply starts from an empty record.
        self.out_path.mkdir(parents=True, exist_ok=True)
        parameter_path = self.out_path / "parameters.json"
        try:
            with open(parameter_path, encoding="utf-8") as infile:
                parameters = json.load(infile)
        except FileNotFoundError:
            parameters = {}

        parameters.update(
            {
                "year_range": self.year_range,
                "embeddings": embedding_model,
                "topic_llm": topicname_llm_model,
            }
        )
        with open(parameter_path, "w", encoding="utf-8") as parameter_file:
            json.dump(parameters, parameter_file)

    def _embeddings(self, load: bool = False) -> None:
        """Generate text embeddings.

        The embedder is always instantiated because the topic model needs
        it. With ``load`` false the texts are encoded and the result is
        cached as ``.npy`` in the output directory; with ``load`` true
        (a re-fit) that cache is read back instead.

        Raises:
            ValueError: If cached embeddings do not have one row per
                document of the current time slice.
        """
        device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
        embeddings_path = (
            self.out_path
            / f"polona_embeddings_{self.year_range[0]}-{self.year_range[1]}.npy"
        )
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own code on this machine; only use trusted embedding models.
        self.sentenceEmbedder = SentenceTransformer(
            self.embedding_model, device=device, trust_remote_code=True
        )

        if load:
            self.embeddings = np.load(embeddings_path)
            if self.embeddings.shape[0] != len(self.text):
                raise ValueError(
                    f"Cached embeddings in {embeddings_path} hold "
                    f"{self.embeddings.shape[0]} rows but the dataset has "
                    f"{len(self.text)} documents; re-run with load=False."
                )
            return

        self.embeddings = self.sentenceEmbedder.encode(
            self.text,
            device=device,
            normalize_embeddings=True,
            show_progress_bar=True,
        )
        np.save(embeddings_path, self.embeddings)

    def _setup_topic_model(self) -> None:
        """Setup topic modeling parameters.

        Change parameters here to influence the number of found topics.
        Refer to the BERTopic documentation. Requires :meth:`_embeddings`
        to have created ``self.sentenceEmbedder``.
        """
        umap_model = UMAP(
            n_neighbors=40,
            n_components=25,
            min_dist=0.0,
            metric="cosine",
            random_state=42,
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=50, min_samples=5, prediction_data=False
        )
        representation_model = MaximalMarginalRelevance(diversity=0.5)

        # NOTE(review): per the BERTopic documentation ``min_topic_size`` is
        # not used once a custom ``hdbscan_model`` is supplied, so cluster
        # size is governed by ``min_cluster_size`` above - confirm intent.
        self.topic_model = BERTopic(
            embedding_model=self.sentenceEmbedder,
            min_topic_size=100,
            top_n_words=5,
            verbose=True,
            calculate_probabilities=False,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            representation_model=representation_model,
        )

    def fit_model(self) -> pd.DataFrame:
        """Fit BERTopic to text data.

        Uses the precomputed ``self.embeddings``; :meth:`_embeddings` and
        :meth:`_setup_topic_model` must have been run before.

        Returns:
            The per-document info table of the fitted topic model.
        """
        self.topic_model.fit_transform(self.text, embeddings=self.embeddings)
        return self.topic_model.get_document_info(self.text)

    def _build_topic_prompt(self, keywords, sample_texts: list) -> str:
        """Assemble the few-shot labelling prompt for a single topic."""
        # The representation is a list of words; render it as the plain
        # comma-separated list used in the example prompt, not a list repr.
        if not isinstance(keywords, str):
            keywords = ", ".join(str(word) for word in keywords if word)
        documents = "\n".join(
            f"{number}. {text}" for number, text in enumerate(sample_texts, start=1)
        )
        return (
            SYSTEM_PROMPT
            + EXAMPLE_PROMPT
            + f"""
                [INST]
                I have a topic that contains the following documents:
                {documents}

                The topic is described by the following keywords: '{keywords}'.

                Based on the information about the topic above, please create a
                short label of this topic in English.
                Make sure you to only return the label and nothing more.
                [/INST]
                """
        )

    def _query_topic_name(self, prompt: str) -> str:
        """Send ``prompt`` to Ollama and return the stripped answer."""
        payload = {
            "model": self.topicLLM,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.3,
                "num_predict": 100,
                "num_ctx": 2048,
                "stop": ["Q:", "\n"],
            },
        }
        response = requests.post(OLLAMA_URL, json=payload, timeout=60)
        response.raise_for_status()
        return response.json()["response"].strip()

    def get_topic_names(self, docs_topics: pd.DataFrame, min_size: int = 100) -> None:
        """Use OLLAMA to generate topic names.

        For every topic except the outlier topic ``-1`` that has at least
        ``min_size`` documents, up to 50 documents are sampled and sent to
        the LLM together with the topic keywords. All answers are written to
        ``topics_by_llm_<start>-<end>.json``; non-empty answers shorter than
        100 characters become the custom topic labels. The labelled model is
        then saved to ``model_<start>-<end>/``.

        Args:
            docs_topics: Per-document table as returned by :meth:`fit_model`
                (columns ``Topic``, ``Document``, ``Representation``).
            min_size: Minimum number of documents a topic needs to be named.

        Raises:
            requests.RequestException: If a request to Ollama fails.
        """
        topic_names = {}
        for topic_id, group in docs_topics.groupby("Topic"):
            if topic_id == -1 or group.shape[0] < min_size:
                continue

            # Fixed seed (same value as the UMAP seed) keeps the prompt, and
            # with it the label, reproducible between runs.
            sampled = group.sample(min(50, len(group)), random_state=42)
            prompt = self._build_topic_prompt(
                keywords=group["Representation"].iloc[0],
                sample_texts=sampled["Document"].tolist(),
            )
            topic_names[topic_id] = self._query_topic_name(prompt)

        topic_df = pd.DataFrame(
            list(topic_names.items()), columns=["topic", "llm_topic_name"]
        )
        topic_df.to_json(
            self.out_path
            / f"topics_by_llm_{self.year_range[0]}-{self.year_range[1]}.json",
            lines=True,
            orient="records",
        )

        # Exclude randomly long texts as topic titles, and empty answers
        # (the "\n" stop token can cut a reply off before any text).
        usable_labels = {
            int(topic): name
            for topic, name in topic_names.items()
            if 0 < len(name) < 100
        }
        self.topic_model.set_topic_labels(usable_labels)

        self.topic_model.save(
            self.out_path / f"model_{self.year_range[0]}-{self.year_range[1]}/",
            serialization="safetensors",
            save_ctfidf=True,
            # BERTopic documents this argument as bool or str; pass the
            # model name so it is stored as the embedding-model pointer.
            save_embedding_model=self.embedding_model,
        )

    def _createLabels(self, row: dict) -> str:
        """Return the display label for one document row.

        Rows with a negative ``Topic`` are outliers and shown as
        "Unlabelled"; all others use their ``CustomName``.
        """
        if row["Topic"] < 0:
            return "Unlabelled"
        return row["CustomName"]

    def plot_map(self) -> None:
        """Generate datamap plot of topic model.

        Writes ``polona_<start>-<end>.html`` to the output directory. Each
        point carries its document metadata for the hover text, and a click
        opens the page in the Academica reader.
        """
        topic_info = self.topic_model.get_document_info(self.text)
        labels = topic_info.apply(self._createLabels, axis=1)

        # Work on a copy so that self.data is left untouched.
        extra_data = self.data[["id", "academica_id", "isodate", "text"]].copy()
        # Stored page numbers are shifted by one for display and linking.
        extra_data.insert(0, "page", self.data["page"] + 1)
        extra_data.insert(0, "label", labels.to_numpy())

        # Deliberately NOT an f-string: {page} and {academica_id} are
        # placeholders that the plot fills from the extra point data.
        on_click = (
            "window.open(`https://academica.edu.pl/reading/readSingle"
            "?page={page}&uid={academica_id}`)"
        )

        figure = self.topic_model.visualize_document_datamap(
            self.text,
            custom_labels=True,
            embeddings=self.embeddings,
            interactive=True,
            title=f"Polona Corpus: {self.year_range[0]} - {self.year_range[1]}",
            enable_search=True,
            int_datamap_kwds={
                "cluster_boundary_polygons": False,
                "cluster_boundary_line_width": 2,
                "darkmode": False,
                "extra_point_data": extra_data,
                "on_click": on_click,
                "hover_text_html_template": HOVER_TEXT_TEMPLATE,
                "histogram_data": self.data["date"],
                "histogram_group_datetime_by": "quarter",
                "histogram_range": (
                    pd.to_datetime(f"{self.year_range[0]}-01-01"),
                    pd.to_datetime(f"{self.year_range[1]}-01-01"),
                ),
            },
        )
        figure.save(
            self.out_path / f"polona_{self.year_range[0]}-{self.year_range[1]}.html"
        )

    def plot(self, load: bool = False) -> None:
        """Run embedding, topic modelling and plotting.

        Args:
            load: Reuse embeddings cached by an earlier run instead of
                encoding the texts again.
        """
        self._embeddings(load=load)
        self._setup_topic_model()
        docs_topics = self.fit_model()
        self.get_topic_names(docs_topics=docs_topics, min_size=100)
        self.plot_map()