python - LSTM-Based Stock Price Prediction: Modeling, Evaluation, and Independence from Test Data

I have a problem with this code: the prediction does not seem to work independently of the test data, although it should. When I first evaluated the model on the test data, the result seemed too good to be true. To check this, I manipulated the test portion of the CSV file so that the values no longer match the real prices. My reasoning was that if the prediction were truly independent of the test data, it would not change. Unfortunately, with the modified file the prediction followed the manipulated test data exactly. This tells me that the code cannot be correct.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.optimizers import Adam
import numpy as np

# 1. Read and prepare the CSV file
# The file has no header row; the two columns are named here and the date
# column becomes the index.
df = pd.read_csv('./Schlusspreise_2000-2009_manipuliert.csv', header=None, names=['Date', 'Price'])
df['Date'] = pd.to_datetime(df['Date'])
df['Price'] = df['Price'].astype(float)
df.set_index('Date', inplace=True)

# 2. Determine the split sizes (by row order): 80 % training, 10 % validation,
# and the remaining rows for testing.
train_size = int(len(df) * 0.8)
val_size = int(len(df) * 0.1)

# 3. Scale the data and split into training, validation, and test sets
# The scaler is fitted on the training rows ONLY. Fitting it on the whole
# series would let the minimum/maximum of the validation and test prices
# influence the scaling, i.e. leak information from data the model is
# supposed not to have seen. Validation/test values may therefore fall
# outside the (0, 1) range after transformation; that is expected.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df[['Price']].iloc[:train_size])
df_scaled = scaler.transform(df[['Price']])

train_data = df_scaled[:train_size]
val_data = df_scaled[train_size:train_size + val_size]
test_data = df_scaled[train_size + val_size:]

# Number of consecutive past (scaled) prices that form one model input.
_WINDOW_SIZE = 60


def _make_windows(series, window=_WINDOW_SIZE):
    """Build sliding-window samples from a 2-D scaled series.

    For every index ``i`` from ``window`` to ``len(series) - 1`` the sample
    input is ``series[i - window:i, 0]`` and the target is ``series[i, 0]``.

    Args:
        series: 2-D array; only column 0 is read.
        window: number of preceding values used as input for each target.

    Returns:
        A tuple ``(x, y)`` where ``x`` has shape ``(samples, window, 1)`` and
        ``y`` has shape ``(samples,)``. Both are empty when ``series`` has
        ``window`` rows or fewer.
    """
    positions = range(window, len(series))
    x = np.array([series[i - window:i, 0] for i in positions])
    y = np.array([series[i, 0] for i in positions])
    # Explicit target shape so that an empty sample list still reshapes cleanly.
    x = np.reshape(x, (len(positions), window, 1))
    return x, y


# Prepare training data
x_train, y_train = _make_windows(train_data)

# Prepare validation data
# The last window of training values is prepended so that the first
# validation target already has a full input window.
inputs_val = np.concatenate((train_data[-_WINDOW_SIZE:], val_data), axis=0)
x_val, y_val = _make_windows(inputs_val)

# Prepare test data
# Same idea: the tail of the validation set provides context for the first
# test target. Every later test window is made of actual test values.
inputs_test = np.concatenate((val_data[-_WINDOW_SIZE:], test_data), axis=0)
x_test, y_test = _make_windows(inputs_test)

# 4. Create LSTM model
# Two stacked LSTM layers (50 units each), each followed by 20 % dropout,
# and a single-unit dense output. The first LSTM returns the full sequence
# so that the second LSTM receives one vector per time step.
lstm_model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=50),
    Dropout(0.2),
    Dense(1)
])
# The model must be compiled before fit(); the original line read
# `lstm_modelpile(...)`, which is an undefined name.
lstm_model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.0015))

# 5. Train the model
# Fit on the training windows for 4 epochs in batches of 32; the validation
# windows are passed along so that `history` also records a validation loss
# per epoch.
fit_options = {
    'epochs': 4,
    'batch_size': 32,
    'verbose': 2,
    'validation_data': (x_val, y_val),
}
history = lstm_model.fit(x_train, y_train, **fit_options)

# 6. Make predictions
def _predict_prices(windows):
    """Run the model on `windows` and map the output back through `scaler`."""
    return scaler.inverse_transform(lstm_model.predict(windows))


def _rmse(actual, predicted):
    """Return the root of the mean squared error between the two arrays."""
    return np.sqrt(mean_squared_error(actual, predicted))


train_predictions = _predict_prices(x_train)
val_predictions = _predict_prices(x_val)
# NOTE: x_test is sliced from the real validation/test prices, so every test
# prediction is a one-step-ahead prediction whose input is the 60 actual
# values preceding it. Changing the test values in the CSV therefore changes
# the model input and, with it, these predictions.
test_predictions = _predict_prices(x_test)

# Calculate error metrics
# The first 60 training rows have no full input window and hence no prediction.
train_actual = scaler.inverse_transform(train_data[60:])
train_rmse = _rmse(train_actual, train_predictions)

val_actual = scaler.inverse_transform(val_data)
val_rmse = _rmse(val_actual[:len(val_predictions)], val_predictions)

test_actual = scaler.inverse_transform(test_data)
test_rmse = _rmse(test_actual[:len(test_predictions)], test_predictions)

# Visualize train_loss and val_loss
# One curve per loss series recorded by fit(), plotted against the epoch.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epoch_count = len(train_loss)

plt.figure(figsize=(10, 6))
plt.plot(train_loss, label='Train Loss', color='blue')
plt.plot(val_loss, label='Validation Loss', color='red')
plt.title('Train vs Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')

# Set x-axis for each epoch
# Tick positions are the 0-based list indices; the labels show them 1-based.
plt.xticks(ticks=range(epoch_count), labels=range(1, epoch_count + 1))

plt.legend()
plt.grid()
plt.show()

# Visualize training and validation data
# Row ranges of the two segments shown in this figure.
train_rows = slice(None, train_size)
val_rows = slice(train_size, train_size + val_size)

# (x values, y values, legend label, colour) for each curve, in drawing order.
curves = [
    (df.index[train_rows], df['Price'][train_rows], "Training Data", "blue"),
    (df.index[val_rows], df['Price'][val_rows], "Validation Data", "purple"),
    # Training predictions start at row 60: earlier rows have no full window.
    (df.index[60:train_size], train_predictions, "Training Prediction", "orange"),
    (df.index[val_rows], val_predictions, "Validation Prediction", "brown"),
]

plt.figure(figsize=(14, 7))
for dates, values, curve_label, curve_color in curves:
    plt.plot(dates, values, label=curve_label, color=curve_color)
plt.xlabel('Date')
plt.ylabel('Closing Price')

# Calculate RMSE for training and validation
train_rmse = np.sqrt(mean_squared_error(train_actual, train_predictions))
val_rmse = np.sqrt(mean_squared_error(val_actual[:len(val_predictions)], val_predictions))

# Title with RMSE
plt.title(f'Training and Validation Data with Predictions for Apple 01.01.2004 – 26.05.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()

# Visualization
# Overview figure: actual prices of all three segments plus the model's
# predictions for each of them.
split_end = train_size + val_size
train_rows = slice(None, train_size)
val_rows = slice(train_size, split_end)
test_rows = slice(split_end, None)

# (x values, y values, legend label, colour) for each curve, in drawing order.
curves = [
    (df.index[train_rows], df['Price'][train_rows], "Training Data", "blue"),
    (df.index[val_rows], df['Price'][val_rows], "Validation Data", "purple"),
    (df.index[test_rows], df['Price'][test_rows], "Test Data", "green"),
    # Training predictions start at row 60: earlier rows have no full window.
    (df.index[60:train_size], train_predictions, "Training Prediction", "orange"),
    (df.index[val_rows], val_predictions, "Validation Prediction", "brown"),
    (df.index[test_rows], test_predictions, "Prediction on Unseen Test Data", "red"),
]

plt.figure(figsize=(14, 7))
for dates, values, curve_label, curve_color in curves:
    plt.plot(dates, values, label=curve_label, color=curve_color)
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title(f'LSTM Prediction - AAPL 01.01.2004 - 31.12.2006 \nTrain RMSE: {train_rmse:.2f}, Val RMSE: {val_rmse:.2f}, Test RMSE: {test_rmse:.2f}')
plt.legend()
plt.grid()
plt.show()

I would be very grateful for any support. Could you explain what could be the problem? I am writing my master's thesis about this.

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

python - LSTM-Based Stock Price Prediction: Modeling, Evaluation, and Independence from Test Data - Stack Overflow

与本文相关的文章

评论列表(0)