Generating missing values
Generating missing values in a DataFrame or NumPy array, where each entry independently becomes missing with a given probability (a fraction between 0 and 1, not a value out of 100):
import numpy as np
def generate_missing_data(df, percentage):
    """
    Return a copy of the input with entries randomly replaced by NaN.

    Each entry is masked independently with probability ``percentage``, so
    the realised share of missing values only approximates that rate.

    Parameters:
    df (pd.DataFrame or np.ndarray): Data to corrupt; it is not modified.
    percentage (float): Probability, between 0 and 1, that an entry becomes
        missing (a fraction, not a value out of 100).

    Returns:
    pd.DataFrame or np.ndarray: Copy of ``df`` with NaN at the drawn
        positions. Integer and boolean NumPy arrays are returned as float
        arrays because those dtypes cannot hold NaN.

    Raises:
    ValueError: If ``percentage`` is outside the interval [0, 1].
    """
    # Reject values such as 10 (meant as "10 %"), which would otherwise
    # silently blank out every entry. The comparison also rejects NaN.
    if not 0 <= percentage <= 1:
        raise ValueError(
            f"percentage must be a fraction between 0 and 1, got {percentage!r}"
        )

    # Boolean mask with the same shape as the data: True marks the entries
    # that will be made missing.
    mask = np.random.rand(*df.shape) < percentage

    # Work on a copy so the caller's data is left untouched.
    if isinstance(df, np.ndarray) and df.dtype.kind in "biu":
        # Integer/boolean arrays cannot store NaN; astype returns a float copy.
        df_missing = df.astype(float)
    else:
        df_missing = df.copy()

    df_missing[mask] = np.nan
    return df_missing
Generating missing values at a given missing rate for a list of time series DataFrames:
import numpy as np
def generate_missing_data_list(time_series_list, missing_rate=0.1):
    """
    Generate missing data in time series dataframes.

    For every dataframe, ``int(missing_rate * number_of_rows)`` distinct rows
    are drawn at random and the value in the first column of those rows is
    replaced by NaN. The input dataframes are left unmodified; copies are
    returned.

    Parameters:
    time_series_list (list of pd.DataFrame): List of time series dataframes
    missing_rate (float): Proportion of rows (between 0 and 1) whose first
        column is set as missing

    Returns:
    list of pd.DataFrame: New list holding copies of the time series
        dataframes with missing data

    Raises:
    ValueError: If ``missing_rate`` is outside the interval [0, 1].
    """
    # NOTE(review): an integer-typed first column may need casting to float
    # before NaN can be stored in it - confirm against the pandas version used.
    if not 0 <= missing_rate <= 1:
        raise ValueError(
            f"missing_rate must be between 0 and 1, got {missing_rate!r}"
        )

    corrupted_list = []
    for series in time_series_list:
        # Copy first so the caller's dataframe is never modified.
        corrupted = series.copy()
        row_count = len(corrupted)

        # Number of rows to blank out (truncated towards zero).
        missing_values_count = int(missing_rate * row_count)

        if missing_values_count:
            # Draw distinct row positions and set the first column to NaN there.
            missing_indices = np.random.choice(
                row_count, missing_values_count, replace=False
            )
            corrupted.iloc[missing_indices, 0] = np.nan

        corrupted_list.append(corrupted)
    return corrupted_list
Calculating MSE ignoring missing values
import time
import numpy as np
def mse_ignore_na(arr1, arr2):
    """
    Mean squared error between two arrays, skipping missing entries.

    A position contributes to the error only when neither ``arr1`` nor
    ``arr2`` holds NaN there.

    Parameters:
    arr1: First array of values
    arr2: Second array of values, indexable by the same boolean mask

    Returns:
    Mean of the squared differences over the positions valid in both inputs
    """
    # A position is dropped as soon as either side is NaN.
    either_missing = np.logical_or(np.isnan(arr1), np.isnan(arr2))
    keep = ~either_missing

    # Differences restricted to the positions present in both inputs.
    residuals = arr1[keep] - arr2[keep]
    return np.mean(residuals ** 2)
Discover more from Science Comics
Subscribe to get the latest posts sent to your email.