from spellchecker import SpellChecker
import re
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess the dataset
with open('/content/Datasetinput.txt', 'r') as file:
    data = file.readlines()

# Create a spell checker object
spell_checker = SpellChecker()

# Function to detect and correct spelling errors in a sentence
def correct_spelling(sentence, spell_checker):
    # Split the sentence into words
    words = re.findall(r'\w+', sentence)
    # Correct the spelling of each word; correction() returns None for
    # words it cannot correct, so fall back to the original word
    corrected_words = [spell_checker.correction(word) or word for word in words]
    # Reconstruct the sentence with corrected words
    corrected_sentence = ' '.join(corrected_words)
    return corrected_sentence

# Correct the spelling errors in each sentence
corrected_data = [correct_spelling(sentence, spell_checker) for sentence in data]

# Tokenize the corrected sentences and pad them to a uniform length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corrected_data)
sequences = tokenizer.texts_to_sequences(corrected_data)
max_sequence_length = max(len(seq) for seq in sequences)
data_processed = pad_sequences(sequences, maxlen=max_sequence_length)

# Define the autoencoder model: it is trained to reproduce its own input
# sequence, so the SimpleRNN layer learns a per-timestep representation
model_autoencoder = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
              input_length=max_sequence_length),
    SimpleRNN(units=128, return_sequences=True),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the autoencoder model (input and target are the same sequences)
model_autoencoder.fit(data_processed, data_processed, epochs=10, batch_size=32,
                      validation_split=0.2)

# Use the trained autoencoder for feature extraction
encoder = Sequential(model_autoencoder.layers[:-1])  # Remove the softmax output layer
encoded_data = encoder.predict(data_processed)  # Shape: (samples, seq_len, 128)

# Define and train the SimpleRNN model using the encoded data
model_rnn = Sequential([
    SimpleRNN(units=128, return_sequences=True),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
model_rnn.fit(encoded_data, data_processed, epochs=10, batch_size=32,
              validation_split=0.2)

# Define and train the Bi-LSTM model using the encoded data
model_bilstm = Sequential([
    Bidirectional(LSTM(units=128, return_sequences=True)),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_bilstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])
model_bilstm.fit(encoded_data, data_processed, epochs=10, batch_size=32,
                 validation_split=0.2)

# Evaluate the models
loss_rnn, accuracy_rnn = model_rnn.evaluate(encoded_data, data_processed)
loss_bilstm, accuracy_bilstm = model_bilstm.evaluate(encoded_data, data_processed)
print(f"SimpleRNN - Loss: {loss_rnn}, Accuracy: {accuracy_rnn}")
print(f"Bi-LSTM - Loss: {loss_bilstm}, Accuracy: {accuracy_bilstm}")
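
# --- Illustrative usage sketch (not part of the original pipeline) ---
# A minimal example of running a new sentence through the same
# preprocessing, encoder, and trained Bi-LSTM. The input string below is
# hypothetical; any text works.
new_sentence = correct_spelling("Ths is a smple sentence", spell_checker)
new_seq = pad_sequences(tokenizer.texts_to_sequences([new_sentence]),
                        maxlen=max_sequence_length)
new_encoded = encoder.predict(new_seq)

# Predicted token ID at each timestep (argmax over the vocabulary)
predicted_ids = np.argmax(model_bilstm.predict(new_encoded), axis=-1)

# Map IDs back to words, skipping the padding index 0
index_word = {i: w for w, i in tokenizer.word_index.items()}
predicted_words = [index_word[i] for i in predicted_ids[0] if i in index_word]
print(' '.join(predicted_words))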