73 lines
1.8 KiB
Python
73 lines
1.8 KiB
Python
import random
|
|
from collections import defaultdict
|
|
from typing import Optional
|
|
|
|
try:
    import spacy

    # NOTE: spacy.load raises OSError (not ImportError) when spaCy is
    # installed but the model data is missing — the original code caught
    # only ImportError, so the "model not found" hint never matched the
    # error it described.  Handle the two failure modes separately.
    nlp = spacy.load('de_core_news_sm')
except ImportError:
    print("Spacy is not installed. Please run: pip install spacy")
    raise
except OSError:
    print("Spacy model not found. Please run: python -m spacy download de_core_news_sm")
    raise
|
|
|
|
|
|
def tokenize_text(text: str) -> list[str]:
    """Split *text* into word tokens, dropping whitespace and punctuation.

    Runs the module-level spaCy pipeline ``nlp`` over the text and keeps
    only the surface form of tokens that are neither spaces nor punctuation.
    """
    return [
        token.text
        for token in nlp(text)
        if not (token.is_space or token.is_punct)
    ]
|
|
|
|
|
|
def build_markov_chain(texts: list[str], order: int = 2) -> dict:
    """Build an order-``order`` Markov chain from *texts*.

    Each key is a tuple of ``order`` consecutive tokens; its value is the
    list of tokens observed immediately after that state.  Followers are
    appended with repetitions, so their frequencies survive for weighted
    random sampling later.
    """
    chain = defaultdict(list)

    for text in texts:
        tokens = tokenize_text(text)
        # A transition needs order + 1 tokens; skip texts that are too short.
        if len(tokens) <= order:
            continue

        for start in range(len(tokens) - order):
            state = tuple(tokens[start:start + order])
            chain[state].append(tokens[start + order])

    return chain
|
|
|
|
|
|
def generate_markov_sentence(chain: dict, max_length: int = 30, start_length: int = 2) -> Optional[str]:
    """Generate a sentence by performing a random walk over *chain*.

    Args:
        chain: Mapping from state tuples to lists of follower tokens, as
            produced by ``build_markov_chain``.
        max_length: Maximum number of tokens in the generated sentence.
        start_length: Retained for backward compatibility; the actual state
            size is now inferred from the chain's keys (see below).

    Returns:
        The generated sentence as a single space-joined string, truncated at
        the first sentence-final punctuation mark, or ``None`` if *chain*
        is empty.
    """
    if not chain:
        return None

    # BUG FIX: the walk used to slice the last ``start_length`` words as the
    # lookup key.  If the chain was built with a different ``order``, no key
    # ever matched and generation stopped right after the seed words.
    # Derive the state size from the chain itself instead.
    order = len(next(iter(chain)))

    # Prefer states whose first token looks capitalized so sentences start
    # naturally.  ``[:1]`` avoids an IndexError on an empty token string.
    start_keys = [k for k in chain if k[0][:1].isupper() or k[0].isupper()]
    if not start_keys:
        start_keys = list(chain.keys())

    current = random.choice(start_keys)
    words = list(current)

    while len(words) < max_length:
        state = tuple(words[-order:])
        if state not in chain:
            break

        next_word = random.choice(chain[state])
        words.append(next_word)

        # Stop early on a sentence-final punctuation token.
        if next_word in '.!?':
            break

    sentence = ' '.join(words)

    # Trim everything after the first sentence-ending punctuation mark.
    for punct in '.!?':
        if punct in sentence:
            sentence = sentence.split(punct)[0] + punct
            break

    return sentence
|
|
|
|
|
|
def process_texts_for_markov(texts: list[str], order: int = 2) -> dict:
    """Convenience wrapper: build an order-``order`` Markov chain from *texts*."""
    return build_markov_chain(texts, order)
|