"""Generates topic model maps for text containing target words."""
#!/usr/bin/env python
# coding: utf-8
import json
import logging
from pathlib import Path
import numpy as np
import pandas as pd
import requests
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP
from sentence_transformers import SentenceTransformer
from torch import cuda
# Module-wide logging: INFO level, timestamped records written to the
# default stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Endpoint that receives the topic-labelling prompts (HTTP POST with a JSON
# payload).
OLLAMA_URL = "http://localhost:11434/api/generate"

# System part of the labelling prompt. It opens the first [INST] section,
# which EXAMPLE_PROMPT below closes.
SYSTEM_PROMPT = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

# One worked example (documents + keywords -> label) shown to the model
# before the actual request. The text is sent verbatim, so it is kept
# exactly as is.
EXAMPLE_PROMPT = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short English label of this topic. Make sure you to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""

# HTML shown when hovering over a point of the interactive map. The
# {placeholders} name columns of the per-point extra data (id, page,
# isodate, label, text). The line break after "Text:" uses <br/>; the
# previous "</br>" is not a valid tag.
HOVER_TEXT_TEMPLATE = """
<div style="background-color:#eeeeeeff;">
<p style="color:#000000;">Document: {id}, page {page}<br/>Date: {isodate}</p>
<p style="color:#ff0000;">Topic: {label}</p>
<p style="color:#000000;">Text:<br/> {text}</p>
</div>
"""
class Plotter:
    """Run topic model and plotting.

    The pipeline (see :meth:`plot`) embeds the texts, fits a BERTopic
    model on them, asks an LLM served at ``OLLAMA_URL`` for a readable
    label per topic, and writes an interactive HTML map of all documents.
    All artefacts are written below ``out_path``.
    """

    # JavaScript executed when a point of the interactive map is clicked.
    # ``{page}`` and ``{academica_id}`` are placeholders that the map fills
    # from the ``extra_point_data`` columns, so this has to stay a plain
    # string: as an f-string Python would try to evaluate them itself and
    # fail with a NameError.
    _ON_CLICK_JS = (
        "window.open(`https://academica.edu.pl/reading/readSingle"
        "?page={page}&uid={academica_id}`)"
    )

    # Columns of ``self.data`` that are attached to every point of the map.
    _EXTRA_COLUMNS = ("id", "page", "academica_id", "isodate", "text")

    def __init__(
        self,
        data_path: str,
        out_path: str,
        year_range: tuple[int, int],
        embedding_model: str = "google/embeddinggemma-300m",
        topicname_llm_model: str = "llama4:scout",
    ) -> None:
        """Load the corpus, restrict it to ``year_range`` and log the setup.

        Args:
            data_path: JSON-lines file with at least the columns ``year``,
                ``date`` and ``text`` (``plot_map`` additionally reads
                ``id``, ``page`` and ``academica_id``).
            out_path: Output directory. It must already contain a
                ``parameters.json`` file, which is updated in place.
            year_range: ``(start, end)``; rows with
                ``start <= year < end`` are kept.
            embedding_model: Name of the sentence-transformers model.
            topicname_llm_model: Name of the model used for topic labels.

        Raises:
            ValueError: If no rows fall into ``year_range``.
            FileNotFoundError: If ``out_path/parameters.json`` is missing.
        """
        self.out_path = Path(out_path)
        self.embedding_model = embedding_model
        self.topicLLM = topicname_llm_model
        self.year_range = year_range

        data = pd.read_json(data_path, lines=True)
        data = data.query(
            f"{year_range[0]} <= year < {year_range[1]}"
        ).reset_index(drop=True)
        # Fail early, before any derived column is computed or written.
        if data.empty:
            raise ValueError("Dataset is empty for chosen time range.")
        # ISO formatted date string, used in the hover text of the map.
        data.insert(0, "isodate", data["date"].apply(lambda x: x.isoformat()))
        self.data = data
        self.text = self.data["text"].to_numpy()

        # Record the settings of this run next to the existing parameters.
        parameter_path = self.out_path / "parameters.json"
        with open(parameter_path, encoding="utf-8") as infile:
            parameters = json.load(infile)
        parameters.update(
            {
                "year_range": self.year_range,
                "embeddings": embedding_model,
                "topic_llm": topicname_llm_model,
            }
        )
        with open(parameter_path, "w", encoding="utf-8") as parameter_file:
            json.dump(parameters, parameter_file)

    @property
    def _range_tag(self) -> str:
        """Return ``"<start>-<end>"``, the suffix shared by all output files."""
        return f"{self.year_range[0]}-{self.year_range[1]}"

    def _embeddings(self, load: bool = False) -> None:
        """Generate text embeddings.

        The sentence embedder is always instantiated because the topic
        model needs it. With ``load=False`` the texts are encoded and the
        result is saved as ``.npy`` file in ``out_path``; with
        ``load=True`` (e.g. for a re-fit) that file is read instead.
        """
        device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
        embeddingspath = self.out_path / f"polona_embeddings_{self._range_tag}.npy"
        self.sentenceEmbedder = SentenceTransformer(
            self.embedding_model, device=device, trust_remote_code=True
        )
        if load:
            self.embeddings = np.load(embeddingspath)
        else:
            self.embeddings = self.sentenceEmbedder.encode(
                self.text,
                device=device,
                normalize_embeddings=True,
                show_progress_bar=True,
            )
            np.save(embeddingspath, self.embeddings)

    def _setup_topic_model(self) -> None:
        """Setup topic modeling parameters.

        Change parameters here to influence the number of found topics.
        Refer to the BERTopic documentation. Requires ``_embeddings`` to
        have been called, since it uses ``self.sentenceEmbedder``.
        """
        umap_model = UMAP(
            n_neighbors=40,
            n_components=25,
            min_dist=0.0,
            metric="cosine",
            random_state=42,
        )
        hdbscan_model = HDBSCAN(
            min_cluster_size=50, min_samples=5, prediction_data=False
        )
        representation_model = MaximalMarginalRelevance(diversity=0.5)
        self.topic_model = BERTopic(
            embedding_model=self.sentenceEmbedder,
            min_topic_size=100,
            top_n_words=5,
            verbose=True,
            calculate_probabilities=False,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            representation_model=representation_model,
        )

    def fit_model(self) -> pd.DataFrame:
        """Fit BERTopic to text data.

        Returns:
            The per-document topic information reported by the fitted
            model for ``self.text``.
        """
        # The returned topic assignments are not needed here; they are
        # read back through ``get_document_info``.
        self.topic_model.fit_transform(self.text, embeddings=self.embeddings)
        return self.topic_model.get_document_info(self.text)

    @staticmethod
    def _topic_request(sample_texts: list, keywords) -> str:
        """Build the topic specific ``[INST]`` section of the LLM prompt.

        Args:
            sample_texts: Example documents of the topic; they are
                numbered starting at 1.
            keywords: Either a ready-made string or an iterable of keyword
                strings. An iterable is joined with ``", "`` (empty entries
                are skipped) so that the keywords look like the ones in
                the worked example instead of a Python list repr.
        """
        if isinstance(keywords, str):
            keyword_text = keywords
        else:
            keyword_text = ", ".join(str(word) for word in keywords if word)
        documents = "\n".join(
            f"{number}. {text}" for number, text in enumerate(sample_texts, start=1)
        )
        return "\n".join(
            [
                "",
                "[INST]",
                "I have a topic that contains the following documents:",
                documents,
                f"The topic is described by the following keywords: '{keyword_text}'.",
                "Based on the information about the topic above, please create a",
                "short label of this topic in English.",
                "Make sure you to only return the label and nothing more.",
                "[/INST]",
                "",
            ]
        )

    def get_topic_names(self, docs_topics: pd.DataFrame, min_size: int = 100) -> None:
        """Use OLLAMA to generate topic names.

        For every topic (except the outlier topic ``-1``) with at least
        ``min_size`` documents, up to 50 documents are sampled and sent
        together with the topic keywords to the LLM. All returned names
        are written to a JSON-lines file; names shorter than 100
        characters are set as custom labels of the topic model, which is
        then saved to ``out_path``.

        Args:
            docs_topics: Document info as returned by :meth:`fit_model`;
                the columns ``Topic``, ``Document`` and ``Representation``
                are read.
            min_size: Minimal number of documents for a topic to be named.

        Raises:
            requests.HTTPError: If the LLM endpoint answers with an error
                status (via ``raise_for_status``).
        """
        topic_names = {}
        for topic_id, group in docs_topics.groupby("Topic"):
            if topic_id == -1 or group.shape[0] < min_size:
                continue
            # Fixed seed so that repeated runs prompt with the same
            # documents (the UMAP step is seeded with 42 as well).
            sampled = group.sample(min(50, len(group)), random_state=42)
            prompt = (
                SYSTEM_PROMPT
                + EXAMPLE_PROMPT
                + self._topic_request(
                    sampled["Document"].tolist(),
                    group["Representation"].iloc[0],
                )
            )
            payload = {
                "model": self.topicLLM,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.3,
                    "num_predict": 100,
                    "num_ctx": 2048,
                    "stop": ["Q:", "\n"],
                },
            }
            response = requests.post(OLLAMA_URL, json=payload, timeout=60)
            response.raise_for_status()
            topic_names[topic_id] = response.json()["response"].strip()

        topic_df = pd.DataFrame(
            list(topic_names.items()), columns=["topic", "llm_topic_name"]
        )
        topic_df.to_json(
            self.out_path / f"topics_by_llm_{self._range_tag}.json",
            lines=True,
            orient="records",
        )
        # Exclude empty answers and randomly long texts as topic titles.
        topic2llm_vInt = {
            int(topic): name
            for topic, name in topic_names.items()
            if 0 < len(name) < 100
        }
        self.topic_model.set_topic_labels(topic2llm_vInt)
        self.topic_model.save(
            self.out_path / f"model_{self._range_tag}",
            serialization="safetensors",
            save_ctfidf=True,
            # For safetensors BERTopic expects a bool or the name of the
            # embedding model, not the model object itself.
            save_embedding_model=self.embedding_model,
        )

    def _createLabels(self, row: dict) -> str:
        """Return the map label of one document row.

        Rows with a negative ``Topic`` get ``"Unlabelled"``, all others
        their ``CustomName``.
        """
        if row["Topic"] < 0:
            return "Unlabelled"
        return row["CustomName"]

    def _extra_point_data(self, labels: pd.Series) -> pd.DataFrame:
        """Assemble the per-point data used for hover text and click action.

        Works on a copy, so ``self.data`` is left untouched. The ``page``
        values are shifted by one and ``labels`` is added as ``label``
        column (aligned on the index). Column order of the result:
        ``label, page, id, academica_id, isodate, text``.
        """
        extra_data = self.data[list(self._EXTRA_COLUMNS)].copy()
        # NOTE(review): the +1 presumably converts a zero-based page index
        # to the one-based numbering of the reader - confirm.
        pages = extra_data.pop("page") + 1
        extra_data.insert(0, "page", pages)
        extra_data.insert(0, "label", labels)
        return extra_data

    def plot_map(self) -> None:
        """Generate datamap plot of topic model.

        Writes the interactive map as HTML file to ``out_path``. Requires
        a fitted topic model with custom labels, i.e. :meth:`fit_model`
        and :meth:`get_topic_names` have to run first.
        """
        topicLabelDF = self.topic_model.get_document_info(self.text)
        labels = topicLabelDF.apply(self._createLabels, axis=1)
        extra_data = self._extra_point_data(labels)
        fig2 = self.topic_model.visualize_document_datamap(
            self.text,
            custom_labels=True,
            embeddings=self.embeddings,
            interactive=True,
            title=f"Polona Corpus: {self.year_range[0]} - {self.year_range[1]}",
            enable_search=True,
            int_datamap_kwds={
                "cluster_boundary_polygons": False,
                "cluster_boundary_line_width": 2,
                "darkmode": False,
                "extra_point_data": extra_data,
                "on_click": self._ON_CLICK_JS,
                "hover_text_html_template": HOVER_TEXT_TEMPLATE,
                "histogram_data": self.data.date,
                "histogram_group_datetime_by": "quarter",
                "histogram_range": (
                    pd.to_datetime(f"{self.year_range[0]}-01-01"),
                    pd.to_datetime(f"{self.year_range[1]}-01-01"),
                ),
            },
        )
        fig2.save(self.out_path / f"polona_{self._range_tag}.html")

    def plot(self, load: bool = False) -> None:
        """Run embedding, topic modelling and plotting.

        Args:
            load: If ``True``, reuse the embeddings saved by an earlier
                run instead of encoding the texts again.
        """
        self._embeddings(load=load)
        self._setup_topic_model()
        docs_topics = self.fit_model()
        self.get_topic_names(docs_topics=docs_topics, min_size=100)
        self.plot_map()