from spellchecker import SpellChecker
import re
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess the dataset
with open('/content/Datasetinput.txt', 'r') as file:
    data = file.readlines()

# Create a spell checker object
spell_checker = SpellChecker()

# Function to detect and correct spelling errors in a sentence
def correct_spelling(sentence, spell_checker):
    # Split the sentence into words
    words = re.findall(r'\w+', sentence)
    # Correct the spelling of each word; correction() returns None for
    # words it cannot correct, so fall back to the original word
    corrected_words = [spell_checker.correction(word) or word for word in words]
    # Reconstruct the sentence with corrected words
    corrected_sentence = ' '.join(corrected_words)
    return corrected_sentence

# Correct the spelling errors in each sentence
corrected_data = [correct_spelling(sentence, spell_checker) for sentence in data]

# Tokenize the corrected sentences and pad them to a uniform length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corrected_data)
sequences = tokenizer.texts_to_sequences(corrected_data)
max_sequence_length = max(len(seq) for seq in sequences)
data_processed = pad_sequences(sequences, maxlen=max_sequence_length)

# Define the autoencoder model: it is trained to reproduce its own input
# sequence, so the SimpleRNN layer learns a per-timestep representation
model_autoencoder = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100,
              input_length=max_sequence_length),
    SimpleRNN(units=128, return_sequences=True),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the autoencoder model (input and target are the same sequences)
model_autoencoder.fit(data_processed, data_processed, epochs=10, batch_size=32,
                      validation_split=0.2)

# Use the trained autoencoder for feature extraction
encoder = Sequential(model_autoencoder.layers[:-1])  # Remove the softmax output layer
encoded_data = encoder.predict(data_processed)  # Shape: (samples, seq_len, 128)

# Define and train the SimpleRNN model using the encoded data
model_rnn = Sequential([
    SimpleRNN(units=128, return_sequences=True),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
model_rnn.fit(encoded_data, data_processed, epochs=10, batch_size=32,
              validation_split=0.2)

# Define and train the Bi-LSTM model using the encoded data
model_bilstm = Sequential([
    Bidirectional(LSTM(units=128, return_sequences=True)),
    Dense(len(tokenizer.word_index) + 1, activation='softmax')
])
model_bilstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])
model_bilstm.fit(encoded_data, data_processed, epochs=10, batch_size=32,
                 validation_split=0.2)

# Evaluate the models
loss_rnn, accuracy_rnn = model_rnn.evaluate(encoded_data, data_processed)
loss_bilstm, accuracy_bilstm = model_bilstm.evaluate(encoded_data, data_processed)
print(f"SimpleRNN - Loss: {loss_rnn}, Accuracy: {accuracy_rnn}")
print(f"Bi-LSTM - Loss: {loss_bilstm}, Accuracy: {accuracy_bilstm}")
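
# --- Illustrative usage sketch (not part of the original pipeline) ---
# A minimal example of running a new sentence through the same
# preprocessing, encoder, and trained Bi-LSTM. The input string below is
# hypothetical; any text works.
new_sentence = correct_spelling("Ths is a smple sentence", spell_checker)
new_seq = pad_sequences(tokenizer.texts_to_sequences([new_sentence]),
                        maxlen=max_sequence_length)
new_encoded = encoder.predict(new_seq)

# Predicted token ID at each timestep (argmax over the vocabulary)
predicted_ids = np.argmax(model_bilstm.predict(new_encoded), axis=-1)

# Map IDs back to words, skipping the padding index 0
index_word = {i: w for w, i in tokenizer.word_index.items()}
predicted_words = [index_word[i] for i in predicted_ids[0] if i in index_word]
print(' '.join(predicted_words))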