73 lines
1.8 KiB
Python
73 lines
1.8 KiB
Python
import random
|
|
from collections import defaultdict
|
|
from typing import Optional
|
|
|
|
try:
    import spacy

    # NOTE: spacy.load raises OSError (not ImportError) when spaCy is
    # installed but the model data is missing — the original code caught
    # only ImportError, so the "model not found" hint never matched the
    # error it described.  Handle the two failure modes separately.
    nlp = spacy.load('de_core_news_sm')
except ImportError:
    print("Spacy is not installed. Please run: pip install spacy")
    raise
except OSError:
    print("Spacy model not found. Please run: python -m spacy download de_core_news_sm")
    raise
|
|
|
|
|
|
def tokenize_text(text: str) -> list[str]:
    """Split *text* into word tokens, dropping whitespace and punctuation.

    Runs the module-level spaCy pipeline ``nlp`` over the text and keeps
    only the surface form of tokens that are neither spaces nor punctuation.
    """
    return [
        token.text
        for token in nlp(text)
        if not (token.is_space or token.is_punct)
    ]
|
|
|
|
|
|
def build_markov_chain(texts: list[str], order: int = 2) -> dict:
    """Build an order-``order`` Markov chain from *texts*.

    Each key is a tuple of ``order`` consecutive tokens; its value is the
    list of tokens observed immediately after that state.  Followers are
    appended with repetitions, so their frequencies survive for weighted
    random sampling later.
    """
    chain = defaultdict(list)

    for text in texts:
        tokens = tokenize_text(text)
        # A transition needs order + 1 tokens; skip texts that are too short.
        if len(tokens) <= order:
            continue

        for start in range(len(tokens) - order):
            state = tuple(tokens[start:start + order])
            chain[state].append(tokens[start + order])

    return chain
|
|
|
|
|
|
def generate_markov_sentence(chain: dict, max_length: int = 30, start_length: int = 2) -> Optional[str]:
    """Generate a sentence by performing a random walk over *chain*.

    Args:
        chain: Mapping from state tuples to lists of follower tokens, as
            produced by ``build_markov_chain``.
        max_length: Maximum number of tokens in the generated sentence.
        start_length: Retained for backward compatibility; the actual state
            size is now inferred from the chain's keys (see below).

    Returns:
        The generated sentence as a single space-joined string, truncated at
        the first sentence-final punctuation mark, or ``None`` if *chain*
        is empty.
    """
    if not chain:
        return None

    # BUG FIX: the walk used to slice the last ``start_length`` words as the
    # lookup key.  If the chain was built with a different ``order``, no key
    # ever matched and generation stopped right after the seed words.
    # Derive the state size from the chain itself instead.
    order = len(next(iter(chain)))

    # Prefer states whose first token looks capitalized so sentences start
    # naturally.  ``[:1]`` avoids an IndexError on an empty token string.
    start_keys = [k for k in chain if k[0][:1].isupper() or k[0].isupper()]
    if not start_keys:
        start_keys = list(chain.keys())

    current = random.choice(start_keys)
    words = list(current)

    while len(words) < max_length:
        state = tuple(words[-order:])
        if state not in chain:
            break

        next_word = random.choice(chain[state])
        words.append(next_word)

        # Stop early on a sentence-final punctuation token.
        if next_word in '.!?':
            break

    sentence = ' '.join(words)

    # Trim everything after the first sentence-ending punctuation mark.
    for punct in '.!?':
        if punct in sentence:
            sentence = sentence.split(punct)[0] + punct
            break

    return sentence
|
|
|
|
|
|
def process_texts_for_markov(texts: list[str], order: int = 2) -> dict:
    """Convenience wrapper: build an order-``order`` Markov chain from *texts*."""
    return build_markov_chain(texts, order)
|