import random
from collections import defaultdict
from typing import Optional

try:
    import spacy
    # spacy.load raises OSError (not ImportError) when the package is
    # installed but the model data is missing, so catch both cases.
    nlp = spacy.load('de_core_news_sm')
except (ImportError, OSError):
    print("Spacy model not found. Please run: python -m spacy download de_core_news_sm")
    raise


def tokenize_text(text: str) -> list[str]:
    """Tokenize *text* with the German spaCy pipeline.

    Whitespace and punctuation tokens are dropped; only word tokens are
    returned, in document order.
    """
    doc = nlp(text)
    return [token.text for token in doc if not token.is_space and not token.is_punct]


def build_markov_chain(texts: list[str], order: int = 2) -> dict[tuple[str, ...], list[str]]:
    """Build an order-*order* Markov chain from a list of texts.

    Each key is a tuple of `order` consecutive tokens; the value is the
    list of tokens observed immediately after that tuple (duplicates kept,
    so random.choice later samples proportionally to frequency).

    Texts with fewer than `order + 1` tokens contribute nothing.
    """
    chain: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for text in texts:
        tokens = tokenize_text(text)
        if len(tokens) <= order:
            continue  # too short to yield even one (key, next_word) pair
        for i in range(len(tokens) - order):
            key = tuple(tokens[i:i + order])
            chain[key].append(tokens[i + order])
    return chain


def generate_markov_sentence(chain: dict, max_length: int = 30, start_length: int = 2) -> Optional[str]:
    """Generate one sentence by walking *chain* from a random start key.

    Prefers start keys whose first token is capitalized (sentence-like
    openings). Generation stops at `max_length` words, at a dead end, or
    at sentence-final punctuation. Returns None for an empty chain.

    `start_length` is kept for backward compatibility; the actual key
    length is taken from the chain itself, since keys built by
    build_markov_chain always have length `order` and a mismatched
    `start_length` would make every lookup miss.
    """
    if not chain:
        return None

    # Prefer keys that look like sentence starts: first token capitalized
    # or fully uppercase. Guard against empty tokens before indexing [0].
    start_keys = [k for k in chain if k[0] and (k[0][0].isupper() or k[0].isupper())]
    if not start_keys:
        start_keys = list(chain.keys())

    current = random.choice(start_keys)
    words = list(current)

    # All keys in a chain share one length (the build order); use it for
    # stepping so generation works for any order, not just the default 2.
    key_length = len(current)

    while len(words) < max_length:
        key = tuple(words[-key_length:])
        if key not in chain:
            break  # dead end: this context was never observed
        next_word = random.choice(chain[key])
        words.append(next_word)
        if next_word in '.!?':
            break

    # Truncate at the first sentence-final punctuation mark, keeping it.
    sentence = ' '.join(words)
    for punct in '.!?':
        if punct in sentence:
            sentence = sentence.split(punct)[0] + punct
            break
    return sentence


def process_texts_for_markov(texts: list[str], order: int = 2) -> dict:
    """Convenience wrapper: build and return a Markov chain for *texts*."""
    return build_markov_chain(texts, order)