Generating missing data and evaluating missing data analysis in Python

Generating missing values

Generating missing values with a given percentage of missingness (passed as a fraction between 0 and 1, e.g. 0.1 for roughly 10%) for a DataFrame or NumPy array:

import numpy as np

def generate_missing_data(df, percentage):
    """Return a copy of ``df`` with entries randomly replaced by NaN.

    Every entry is masked independently of the others, so ``percentage`` is
    the expected share of missing entries, not an exact count.

    Parameters:
    df (pd.DataFrame or np.ndarray): Data to corrupt; it is not modified.
    percentage (float): Probability, between 0 and 1, that any single entry
        is replaced by NaN (0.1 means roughly 10% missing).

    Returns:
    pd.DataFrame or np.ndarray: Copy of ``df`` with NaN at the masked
        positions. Integer and boolean NumPy arrays are returned as float
        arrays because those dtypes cannot store NaN.
    """
    # Boolean mask with the same shape as df: True marks an entry to drop.
    mask = np.random.rand(*df.shape) < percentage

    if isinstance(df, np.ndarray) and df.dtype.kind in "biu":
        # Integer arrays raise when NaN is assigned and boolean arrays would
        # silently store True, so move to float first (astype also copies).
        df_missing = df.astype(float)
    else:
        # Work on a copy so the caller's data stays untouched.
        df_missing = df.copy()

    df_missing[mask] = np.nan

    return df_missing

Generating missing values with a given missing rate for a list of time series DataFrames (only the first column of each DataFrame is affected):

import numpy as np
def generate_missing_data_list(time_series_list, missing_rate=0.1):
    """
    Generate missing data in the first column of time series dataframes

    The input dataframes are left untouched: every dataframe is copied
    before values are removed, and a new list is returned.

    Parameters:
    time_series_list (list of pd.DataFrame): List of time series dataframes
    missing_rate (float): Proportion of rows whose first-column value is
        set as missing; the count per dataframe is rounded down

    Returns:
    list of pd.DataFrame: New list of dataframe copies with missing data
    """
    result = []

    for series in time_series_list:
        # Copy first so the caller's dataframe is not modified
        series_missing = series.copy()
        row_count = len(series_missing)

        # Get the number of missing values to generate
        missing_values_count = int(missing_rate * row_count)

        # Draw distinct row positions, so exactly that many rows are hit
        missing_indices = np.random.choice(row_count, missing_values_count, replace=False)

        # Set the first-column value at the chosen rows to NaN
        series_missing.iloc[missing_indices, 0] = np.nan

        result.append(series_missing)

    return result

Calculating MSE ignoring missing values

import time
import numpy as np

def mse_ignore_na(arr1, arr2):
    """Compute the mean squared error between two inputs, skipping NaNs.

    A position contributes to the result only when the value is present
    (not NaN) in both inputs. Values are compared by position.

    Parameters:
    arr1 (array-like): First set of values; anything ``np.asarray`` can
        convert to a float array, such as an ndarray or a list.
    arr2 (array-like): Second set of values, with the same shape as arr1.

    Returns:
    float: Mean of the squared differences over the valid positions, or
        NaN when no position is valid in both inputs.
    """
    # Normalise to float arrays so plain lists support boolean indexing.
    values1 = np.asarray(arr1, dtype=float)
    values2 = np.asarray(arr2, dtype=float)

    # Mask for positions that are non-missing in both arrays
    valid_mask = ~(np.isnan(values1) | np.isnan(values2))

    # Nothing to compare: return NaN directly instead of letting np.mean
    # warn about an empty slice.
    if not valid_mask.any():
        return np.nan

    # Compute MSE only for valid values
    return np.mean((values1[valid_mask] - values2[valid_mask]) ** 2)


Discover more from Science Comics

Subscribe to get the latest posts sent to your email.

Leave a Reply

error: Content is protected !!