Files
ulfbot/markov.py
2026-03-15 09:12:21 +01:00

73 lines
1.8 KiB
Python

import random
from collections import defaultdict
from typing import Optional

try:
    import spacy

    # NOTE: spacy.load() raises OSError (not ImportError) when spaCy is
    # installed but the model package is missing -- the original only
    # caught ImportError, so the download hint never printed in that
    # (most common) failure case. Catch both.
    nlp = spacy.load('de_core_news_sm')
except (ImportError, OSError):
    print("Spacy model not found. Please run: python -m spacy download de_core_news_sm")
    raise
def tokenize_text(text: str) -> list[str]:
    """Tokenize *text* with the module-level spaCy pipeline.

    Whitespace and punctuation tokens are discarded; everything else is
    returned as plain token strings in document order.
    """
    return [tok.text for tok in nlp(text) if not (tok.is_space or tok.is_punct)]
def build_markov_chain(texts: list[str], order: int = 2) -> dict:
    """Build an order-*order* Markov chain from *texts*.

    Each key is a tuple of *order* consecutive tokens; the value is the
    list of tokens observed to follow that state (duplicates kept, so
    frequent followers are proportionally more likely to be sampled).
    """
    chain = defaultdict(list)
    for text in texts:
        words = tokenize_text(text)
        # A text needs at least order + 1 tokens to yield one transition.
        if len(words) <= order:
            continue
        for start in range(len(words) - order):
            state = tuple(words[start:start + order])
            chain[state].append(words[start + order])
    return chain
def generate_markov_sentence(chain: dict, max_length: int = 30, start_length: int = 2) -> Optional[str]:
    """Generate a sentence by randomly walking *chain*.

    Args:
        chain: Mapping of token tuples (states) to lists of follower tokens,
            as produced by ``build_markov_chain``.
        max_length: Maximum number of tokens in the generated sentence.
        start_length: Kept for backward compatibility; the state size is now
            inferred from the chain's own keys, so chains built with any
            ``order`` work correctly.

    Returns:
        The generated sentence, or ``None`` when *chain* is empty.
    """
    if not chain:
        return None
    # BUG FIX: the original always re-sliced the last `start_length` words,
    # which silently broke generation for chains built with order != 2
    # (every lookup missed and the walk stopped at the seed). Infer the
    # state size from an actual key instead.
    order = len(next(iter(chain)))
    # Prefer states whose first token is capitalised so sentences start
    # naturally. `k[0][:1]` avoids an IndexError on empty tokens and
    # subsumes the original's redundant `or k[0].isupper()` clause.
    start_keys = [k for k in chain if k[0][:1].isupper()]
    if not start_keys:
        start_keys = list(chain)
    words = list(random.choice(start_keys))
    while len(words) < max_length:
        state = tuple(words[-order:])
        followers = chain.get(state)
        # Stop on unseen states; also guard against an empty follower
        # list, where random.choice would raise IndexError.
        if not followers:
            break
        next_word = random.choice(followers)
        words.append(next_word)
        # Set membership instead of substring test: `'' in '.!?'` is
        # True, which would have ended the sentence on an empty token.
        if next_word in {'.', '!', '?'}:
            break
    sentence = ' '.join(words)
    # Truncate at the first sentence-ending punctuation mark, if any.
    for punct in '.!?':
        if punct in sentence:
            sentence = sentence.split(punct)[0] + punct
            break
    return sentence
def process_texts_for_markov(texts: list[str], order: int = 2) -> dict:
    """Convenience wrapper: build and return a Markov chain for *texts*."""
    return build_markov_chain(texts, order)