Code erstellt von OpenCode.ai
This commit is contained in:
72
markov.py
Normal file
72
markov.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import random
|
||||
from collections import defaultdict
|
||||
from typing import Optional
|
||||
|
||||
# Load the German spaCy pipeline once at import time; tokenize_text() uses it.
try:
    import spacy

    # spacy.load() raises OSError (not ImportError) when the model package is
    # not installed, so both failure modes must be caught for the hint below
    # to be shown in the situation it describes.
    nlp = spacy.load('de_core_news_sm')
except (ImportError, OSError):
    print("Spacy model not found. Please run: python -m spacy download de_core_news_sm")
    raise
|
||||
|
||||
|
||||
def tokenize_text(text: str) -> list[str]:
    """Split *text* into word tokens using the module-level spaCy pipeline.

    Tokens that spaCy flags as whitespace or punctuation are dropped; the
    remaining tokens are returned as their surface text, in document order.
    """
    return [
        tok.text
        for tok in nlp(text)
        if not (tok.is_space or tok.is_punct)
    ]
|
||||
|
||||
|
||||
def build_markov_chain(texts: list[str], order: int = 2) -> dict:
    """Build a word-level Markov chain from a collection of texts.

    Each text is tokenized with ``tokenize_text``. Every run of ``order``
    consecutive tokens becomes a tuple key, and the token that follows the
    run is appended to that key's list (duplicates are kept, so frequent
    followers are proportionally more likely to be drawn later).

    Args:
        texts: The input texts.
        order: Number of tokens per key.

    Returns:
        A ``defaultdict(list)`` mapping token tuples to follower lists.
    """
    transitions = defaultdict(list)

    for text in texts:
        words = tokenize_text(text)
        # A text needs more than `order` tokens to yield any transition.
        if len(words) > order:
            for start in range(len(words) - order):
                stop = start + order
                state = tuple(words[start:stop])
                follower = words[stop]
                transitions[state].append(follower)

    return transitions
|
||||
|
||||
|
||||
def generate_markov_sentence(chain: dict, max_length: int = 30, start_length: int = 2) -> Optional[str]:
    """Generate one sentence by randomly walking a Markov chain.

    Args:
        chain: Mapping from a tuple of words to the list of words observed
            after that tuple.
        max_length: Generation stops once the sentence holds this many words.
        start_length: Kept for backward compatibility. The lookup window is
            taken from the length of the chosen start key, so chains of any
            order are walked correctly.

    Returns:
        The generated words joined by single spaces, cut after the first
        '.', '!' or '?' token if one occurs, or ``None`` when ``chain`` is
        empty.
    """
    if not chain:
        return None

    terminators = {'.', '!', '?'}
    keys = list(chain)

    # Prefer keys whose first word looks like a sentence start. The
    # truthiness checks skip empty keys / empty words instead of raising
    # IndexError on k[0][0].
    start_keys = [
        k for k in keys
        if k and k[0] and (k[0][0].isupper() or k[0].isupper())
    ]
    current = random.choice(start_keys or keys)
    words = list(current)

    # The window must match the key length used by the chain; a fixed
    # window would never match keys of a different order.
    window = len(current)

    # Only whole tokens end a sentence, so words such as "z.B." or "3.5"
    # no longer truncate the output.
    end = next((i for i, w in enumerate(words) if w in terminators), None)

    while end is None and len(words) < max_length:
        key = tuple(words[len(words) - window:])
        # .get() avoids inserting keys into a defaultdict; an empty
        # follower list ends the walk instead of crashing random.choice.
        followers = chain.get(key)
        if not followers:
            break

        next_word = random.choice(followers)
        words.append(next_word)
        if next_word in terminators:
            end = len(words) - 1

    if end is not None:
        words = words[:end + 1]

    return ' '.join(words)
|
||||
|
||||
|
||||
def process_texts_for_markov(texts: list[str], order: int = 2) -> dict:
    """Return the Markov chain built from *texts*.

    Thin convenience wrapper that forwards both arguments unchanged to
    ``build_markov_chain`` and returns its result.
    """
    return build_markov_chain(texts, order)
|
||||
Reference in New Issue
Block a user