diff --git a/imagery_scraping/download_imagery.py b/imagery_scraping/download_imagery.py index 60f34cc..a8fc7a5 100644 --- a/imagery_scraping/download_imagery.py +++ b/imagery_scraping/download_imagery.py @@ -220,7 +220,7 @@ def download_imagery(filepath, drive, year, sensor, range_km, rgb_only, parallel export_params = { - 'description': target_df[name_colname][i], + 'description': str(target_df[name_colname][i]), 'folder': drive, 'scale': resolution_m, # This is the resolution in meters 'region': region, diff --git a/modelling/dino/finetune_spatial.py b/modelling/dino/finetune_spatial.py index e712055..a963f7d 100644 --- a/modelling/dino/finetune_spatial.py +++ b/modelling/dino/finetune_spatial.py @@ -16,8 +16,55 @@ from sklearn.model_selection import train_test_split from torch.optim import Adam from torch.nn import L1Loss + + +""" +Finetuning the DinoV2 model using spatial data +This spatial data is stored in survey_processing/processed_data and is split into folds +View survey_processing/main.py for more information on this + +Dinov2's input is a RGB satellite image converted to a tensor +The target for each satellite image is several dhs variables (predict_target variable) +These variables once one-hot encoded form a larger dimension vector +The DinoV2 model outputs a 768 dimension vector, so we add an additional linear layer with sigmoid activation function +in order to get an output the size of our target vector + +After each epoch we save the models weights to our 'last model' file and if the error is sufficiently low - +we save it to 'best model' file also + +Satellite imagery is saved in the following file structure +Sub directories should be of the form country code + year + satellite +Filenames are the CENTROID_ID + +- imagery parent directory +--- ET2018S2 +------ ET2000000090.tif +------ ET2000000213.tif +------ ... +--- RW2018S2 +------ ... +--- ... 
+""" + + def main(fold, model_name, imagery_path, imagery_source, emb_size, batch_size, num_epochs): - + """ + Finetunes and validates Dinov2 model using one fold of data + Saves the last and the best model weights to file + + Parameters: + fold (integer): fold number + model_name (string): model name (i.e dinov2_vitb14) + imagery_path (string): parent directory of imagery + imagery_source (string): Landsat (L) or Sentinel (S) + emb_size (int): size of model output, default is 768 + batch_size (int): batch size + num_epochs (int): number of epochs + + Returns: + None + """ + if imagery_source == 'L': normalization = 30000. imagery_size = 336 @@ -28,24 +75,30 @@ def main(fold, model_name, imagery_path, imagery_source, emb_size, batch_size, n raise Exception("Unsupported imagery source") data_folder = r'survey_processing/processed_data' + # load preprocessed dhs data of the fold we are considering, we will take target columns from this train_df = pd.read_csv(f'{data_folder}/train_fold_{fold}.csv', index_col=0) test_df = pd.read_csv(f'{data_folder}/test_fold_{fold}.csv', index_col=0) + # store file paths of all available imagery in following list available_imagery = [] for d in os.listdir(imagery_path): + # d[-2] will either be S or L, refer to top comment to understand file structure of saved images if d[-2] == imagery_source: for f in os.listdir(os.path.join(imagery_path, d)): available_imagery.append(os.path.join(imagery_path, d, f)) + + # gets filename of each image without the .fileformat available_centroids = [f.split('/')[-1][:-4] for f in available_imagery] + # filter df to remove rows with no corresponding satellite image train_df = train_df[train_df['CENTROID_ID'].isin(available_centroids)] test_df = test_df[test_df['CENTROID_ID'].isin(available_centroids)] + def filter_contains(query): """ Returns a list of items that contain the given query substring. Parameters: - items (list of str): The list of strings to search within. 
query (str): The substring to search for in each item of the list. Returns: @@ -55,11 +108,17 @@ def filter_contains(query): for item in available_imagery: if query in item: return item + + + # add file path of satellite imagery corresponding to each row train_df['imagery_path'] = train_df['CENTROID_ID'].apply(filter_contains) test_df['imagery_path'] = test_df['CENTROID_ID'].apply(filter_contains) + # dhs variables to use as target data + # vaccination status, wealth index, height for age s.d, level of education, water access, sleeping arrangements etc predict_target = ['h10', 'h3', 'h31', 'h5', 'h7', 'h9', 'hc70', 'hv109', 'hv121', 'hv106', 'hv201', 'hv204', 'hv205', 'hv216', 'hv225', 'hv271', 'v312'] + # find one hot encoded columns associated with each of the categorical targets using regex filtered_predict_target = [] for col in predict_target: filtered_predict_target.extend( @@ -69,6 +128,7 @@ def filter_contains(query): train_df = train_df.dropna(subset=filtered_predict_target) predict_target = sorted(filtered_predict_target) + def load_and_preprocess_image(path): with rasterio.open(path) as src: # Read the specific bands (4, 3, 2 for RGB) @@ -85,6 +145,7 @@ def load_and_preprocess_image(path): return img.astype(np.uint8) # Convert to uint8 + def set_seed(seed): torch.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -94,13 +155,20 @@ def set_seed(seed): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - # Set your desired seed + + # set your desired seed seed = 42 set_seed(seed) train, validation = train_test_split(train_df, test_size=0.2, random_state=42) + class CustomDataset(Dataset): + """ + Stores dataframe and transform (collection of image transforms) + When object is indexed, returns image_tensor, target + """ + def __init__(self, dataframe, transform): self.dataframe = dataframe self.transform = transform @@ -118,6 +186,8 @@ def __getitem__(self, idx): target = torch.tensor(item[predict_target], dtype=torch.float32) 
return image_tensor, target # Adjust based on actual output of feature_extractor + + # convert image to tensor of the correct size transform = transforms.Compose([ transforms.Resize((imagery_size, imagery_size)), # Resize the image to the input size expected by the model transforms.ToTensor(), # Convert the image to a PyTorch tensor @@ -132,6 +202,7 @@ def __getitem__(self, idx): base_model = torch.hub.load('facebookresearch/dinov2', model_name) + def save_checkpoint(model, optimizer, epoch, loss, filename="checkpoint.pth"): torch.save({ 'epoch': epoch, @@ -140,8 +211,16 @@ def save_checkpoint(model, optimizer, epoch, loss, filename="checkpoint.pth"): 'loss': loss }, filename) + torch.cuda.empty_cache() + + class ViTForRegression(nn.Module): + """ + Parent class is nn.Module (i.e DinoV2 model) + Adds additional linear layer with sigmoid activation function in order to get output of length len(predict_target) + """ + def __init__(self, base_model): super().__init__() self.base_model = base_model @@ -153,6 +232,8 @@ def forward(self, pixel_values): # We use the last hidden state return torch.sigmoid(self.regression_head(outputs)) + + # load last and best model for comparison of loss device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = ViTForRegression(base_model).to(device) best_model = f'modelling/dino/model/{model_name}_{fold}_all_cluster_best_{imagery_source}.pth' @@ -177,10 +258,12 @@ def forward(self, pixel_values): optimizer = torch.optim.Adam([base_model_params, head_params]) loss_fn = L1Loss() + # training and validation for epoch in range(epochs_ran+1, num_epochs): torch.cuda.empty_cache() model.train() print('Training...') + for batch in tqdm(train_loader): images, targets = batch images, targets = images.to(device), targets.to(device) @@ -193,7 +276,9 @@ def forward(self, pixel_values): optimizer.zero_grad() loss.backward() optimizer.step() + torch.cuda.empty_cache() + # Validation phase model.eval() val_loss = [] @@ -214,6 +299,7 
@@ def forward(self, pixel_values): mean_val_loss = np.mean(val_loss) mean_indiv_loss = torch.stack(indiv_loss).mean(dim=0) + # save best and last model if appropriate if mean_val_loss< best_error: save_checkpoint(model, optimizer, epoch, mean_val_loss, filename=best_model) best_error = mean_val_loss @@ -221,7 +307,7 @@ def forward(self, pixel_values): save_checkpoint(model, optimizer, epoch, mean_val_loss, filename=last_model) - +# handle command line inputs, note we have to run a separate command to train on each fold if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run satellite image processing model training.') parser.add_argument('--fold', type=int, help='CV fold') diff --git a/survey_processing/main_all_children.py b/survey_processing/main_all_children.py new file mode 100644 index 0000000..d961416 --- /dev/null +++ b/survey_processing/main_all_children.py @@ -0,0 +1,849 @@ +""" +Author: Luke Yang +Date: 2024-06-11 + +Adapted By: Jack Gidney +Date: 2024-07-22 + +This script processes DHS data to generate indicators of poverty and deprivation, then saves the data in train/test splits. + +Usage: + python main_all_children.py parent_dir [config_file] + +Arguments: + parent_dir: The parent directory enclosing all DHS folders. + config_file (optional): The configuration parameters for preprocessing. Defaults to 'processing_params.json' if not provided. + +Functions: + get_poverty(source_path: str, save_csv: bool = False) -> pd.DataFrame: + Processes DHS data files to generate various deprivation indicators and optionally saves the result as a CSV file. + + Parameters: + source_path (str): The path to the directory containing the DHS data files. + save_csv (bool): If True, saves the resulting DataFrame to a CSV file in the source directory. + + Returns: + pd.DataFrame: The processed data with deprivation indicators. 
+ + process_dhs(parent_dir: str, config_file: str) -> None + Uses the get_poverty() function to get DHS data and deprivation indicators, aggregates to the cluster level. + Adds GPS data, saves as a CSV and saves data in train and test folds. + + Parameters: + parent_dir (str): The path to the directory containing the DHS data files. + config_file (str): The path to the JSON file containing the config information. + + Returns: + None + + find_sub_file(directory, pattern: str) -> str: + Finds and returns the filename in a directory that matches a given pattern. + + make_string(integer: int, length: int) -> str: + Pads beginning of string with zeros + + check_file_integrity(parent_dir: str, all_files: list, country_code: dict) -> bool + Checks DHS data contains all the countries specified in JSON config file. + + country_code_to_name(country_code: str) -> str + Converts country code to name of country (using JSON config file). +""" + + +import argparse +import os +import re +import pandas as pd +import numpy as np +import warnings +from tqdm import tqdm +import json +import geopandas as gpd +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import KFold +from pandas.api.types import is_numeric_dtype + + +# Ignore all warnings +warnings.filterwarnings('ignore') +# parent directory of this file, dhs data folder and processed dhs data folder
par_dir = r'C:/Users/jgidn/Documents/Summer Project/KidSat/survey_processing/' + + +def find_sub_file(directory, pattern:str): + for f in os.listdir(directory): + if pattern.lower() in f.lower(): + return f + + +def get_poverty(source_path, save_csv = False): + """ + For each DHS corresponding to a country and a specific year + We generate a DataFrame formed of the KR, IR and PR merged, saved as dhs_variables.csv + We generate another DataFrame, saved as poverty_variables.csv + This contains basic characteristics of the individual (sex, age, gender, ...) 
+ As well as moderate/severe deprivation flags + If any of the KR, IR, PR do not have the correct columns (due to the age of the dataset) + Then we will not generate poverty_variables.csv + + Parameters: + source_path (string): File path to DHS folder i.e dhs_data/AO_2015_DHS_XXX + save_csv (boolean): Indicator whether to save poverty_variables and dhs_variables to csvs (False if debugging) + + Returns: + df (DataFrame): The DataFrame with the deprivation flags + + """ + + # get filepaths of KR, IR and PR by iterating through files in survey folder + for f in os.listdir(source_path): + if 'KR' in f: + child_datafile = os.path.join(source_path, f, find_sub_file(source_path+f,'dta')) + elif 'PR' in f: + household_datafile = os.path.join(source_path, f, find_sub_file(source_path+f,'dta')) + elif 'IR' in f: + individual_datafile = os.path.join(source_path, f, find_sub_file(source_path+f,'dta')) + + # grab the survey year and country code from folder name + survey_year = source_path.split('/')[-2][3:7] + country_code = source_path.split('/')[-2][:2] + + # if any of the datasets are too old then they don't have the all the columns we need + # we try and grab these columns from each of the datasets + # if any are not available then we do not create poverty_variables.csv, and we *may not join PR/IR to KR + missing_pr_cols = False + missing_ir_cols = False + missing_kr_cols = False + cols_from_kr = ["v000", "v001", "v002", "v004", "b16", "b19", "v005", "h10", + "h3", "h5", "h7", "h9", "h31", "h31b", "h31c", "h32y", "h32z"] + cols_from_ir = ["v626a", "v312", "v001", "v002", "v003"] + cols_from_pr = ['hv000', 'hv005', 'hv007', 'hv111', 'hv113', 'hv009', + 'hv024', 'hv025', 'hv104', 'hv105', 'hv109', 'hv121', + 'hv122', 'hv201', 'hv204', 'hv205', 'hv216', 'hv225', + 'hv270', 'hv271', 'hc70', 'hv001', 'hv002', 'hvidx'] + + # read KR + dhs_kr = pd.read_stata(child_datafile, convert_categoricals=False) + + # we move age in months from hw1 to b19 if b19 not available + if 'b19' 
not in dhs_kr.columns: + dhs_kr['b19'] = dhs_kr['hw1'] + + # subset columns of KR and update missing cols flag + cols_to_subset = [] + for col in cols_from_kr: + if col in dhs_kr.columns: + cols_to_subset.append(col) + else: + missing_kr_cols = True + dhs_kr = dhs_kr[cols_to_subset] + + # add child weight column + dhs_kr['chweight'] = dhs_kr['v005'] / 1000000 + + # older datasets have no line numbers for the children in the KR (b16) + if "b16" in dhs_kr.columns: + # remove any child without a line number + dhs_kr = dhs_kr[(dhs_kr['b16'].notna()) & (dhs_kr['b16'] != 0)] + + # rename identifier columns, drop duplicate rows before merge + dhs_kr = dhs_kr.rename(columns={"v001" : "hv001", "v002" : "hv002", "b16" : "hvidx"}) + dhs_kr = dhs_kr.drop_duplicates(subset=['hv001', 'hv002', 'hvidx']) + else: + dhs_kr = dhs_kr.rename(columns={"v001" : "hv001", "v002" : "hv002"}) + + # read PR file, subset columns, add hh weight column + dhs_pr = pd.read_stata(household_datafile, convert_categoricals=False) + + # subset columns of PR and update pov_dfs flag + cols_to_subset = [] + for col in cols_from_pr: + if col in dhs_pr.columns: + cols_to_subset.append(col) + else: + missing_pr_cols = True + dhs_pr = dhs_pr[cols_to_subset] + + # add household weight column + dhs_pr['hhweight'] = dhs_pr['hv005'] / 1000000 + + # read IR file + dhs_ir = pd.read_stata(individual_datafile, convert_categoricals=False) + + # subset columns of IR and update pov_dfs flag + cols_to_subset = [] + for col in cols_from_ir: + if col in dhs_ir.columns: + cols_to_subset.append(col) + else: + missing_ir_cols = True + dhs_ir = dhs_ir[cols_to_subset] + + # rename identifier columns of IR + dhs_ir = dhs_ir.rename(columns={"v001" : "hv001", "v002" : "hv002", "v003" : "hvidx"}) + + # before any joins we need to check that it is possible + # note that some older datasets have no child line numbers in KR + if ("hv001" in dhs_pr.columns) and ("hv002" in dhs_pr.columns) and ("hvidx" in dhs_pr.columns) \ + and 
("hvidx" in dhs_kr.columns): + # KR outer join to PR + dhs_kr = dhs_kr.merge(dhs_pr, how="outer", on=["hv001", "hv002", "hvidx"]) + + if ("hv001" in dhs_ir.columns) and ("hv002" in dhs_ir.columns) and ("hvidx" in dhs_ir.columns) \ + and ("hvidx" in dhs_kr.columns): + # IR outer join to PR/KR + dhs_kr = dhs_kr.merge(dhs_ir, how="outer", on=["hv001", "hv002", "hvidx"]) + + # remove all adults, if hv105 doesn't exist then we must be dealing with just the KR (MD_1997 and TZ_1999) + if "hv105" in dhs_kr: + dhs_kr = dhs_kr[dhs_kr["hv105"] < 18] + + # clean up after merges + # replace/rename ultimate area code (v004) and v001 with hv001 + if "hv001" in dhs_kr.columns: + dhs_kr = dhs_kr.drop("v004", axis=1) + else: + dhs_kr = dhs_kr.rename(columns={"v004" : "hv001"}) + + # replace/rename country code + if ("hv000" in dhs_kr.columns): + dhs_kr = dhs_kr.drop("v000", axis=1) + else: + dhs_kr = dhs_kr.rename(columns={"v000" : "hv000"}) + + # add survey year column from name of folder it's contained in + dhs_kr['countrycode'] = country_code + dhs_kr['year'] = survey_year + dhs_kr['survey'] = 'DHS' + + # save the final merged dataset + if save_csv: + output_path = source_path+'dhs_variables.csv' + dhs_kr.to_csv(output_path, index=False) + + # check whether we have the correct columns to create the pov_df + if missing_ir_cols or missing_kr_cols or missing_pr_cols: + return False + + # lets now create our pov df + df = dhs_kr + + ## calculate orphanhood proportion + + df["orphaned"] = ~(df['hv111'].astype(bool) & df['hv113'].astype(bool)) + df["orphaned"] = df["orphaned"].astype(float) + + ## calculate housing deprivation + + df['personsperroom'] = df['hv009'] / df['hv216'] + + # Generate severe housing deprivation flag + df['dep_housing_sev'] = (df['personsperroom'] >= 5).astype(int) + + # Generate moderate housing deprivation flag + df['dep_housing_mod'] = (df['personsperroom'] >= 3).astype(int) + + ## calculate water deprivation + + df['dep_water_sev'] = 0 # Default to 0 (no 
severe deprivation) + df.loc[df['hv201'].isin([32, 42, 43, 96]), 'dep_water_sev'] = 1 # Recode specific values to 1 + df.loc[df['hv201'] == 99, 'dep_water_sev'] = pd.NA # Set specific values to NaN + + # Generate 'dep_water_mod' as a copy of 'dep_water_sev' + df['dep_water_mod'] = df['dep_water_sev'] + + # Update 'dep_water_mod' for moderate water deprivation conditions + # Here it's important to ensure no overwrite of previously set severe conditions + mask_mod = (df['dep_water_mod'] == 0) & (~df['hv201'].isin([32, 42, 43, 96])) & (df['hv204'] > 30) & (df['hv204'] <= 900) + df.loc[mask_mod, 'dep_water_mod'] = 1 + + ## calculate sanitation deprivation + + df['dep_sanitation_sev'] = 0 # Default to 0 (no severe deprivation) + df.loc[df['hv205'].isin([23, 31, 42, 43, 96]), 'dep_sanitation_sev'] = 1 # Recode specific values to 1 + df.loc[df['hv205'] == 99, 'dep_sanitation_sev'] = pd.NA # Set specific values to NaN + + # Generate 'dep_sanitation_mod' as a copy of 'dep_sanitation_sev' + df['dep_sanitation_mod'] = df['dep_sanitation_sev'] + + # Update 'dep_sanitation_mod' for moderate sanitation deprivation conditions + # Here it's important to ensure no overwrite of previously set severe conditions + mask_mod_sanitation = (df['dep_sanitation_mod'] == 0) & (~df['hv205'].isin([23, 31, 42, 43, 96])) & (df['hv225'] == 1) + df.loc[mask_mod_sanitation, 'dep_sanitation_mod'] = 1 + + ## calculate nutrition deprivation + + df['dep_nutrition_sev'] = 0 # Initialize with 0 (no severe deprivation) + df.loc[df['hc70'] <= -300, 'dep_nutrition_sev'] = 1 # Set to 1 where severe stunting occurs + + # Generate moderate nutrition deprivation based on stunting more than -2 standard deviations + df['dep_nutrition_mod'] = 0 # Initialize with 0 (no moderate deprivation) + df.loc[df['hc70'] <= -200, 'dep_nutrition_mod'] = 1 # Set to 1 where moderate stunting occurs + + ## calculate health deprivation + + age_filter = (df['b19'] >= 12) & (df['b19'] <= 35) # Children between 12 to 35 months + + # 
DPT 1 Deprivation + df['dpt1deprived'] = 0 # Initialize column + df.loc[age_filter & ((df['h10'] == 0) | (df['h3'] == 0)), 'dpt1deprived'] = 1 + df.loc[age_filter & (df['h3'].between(1, 3)), 'dpt1deprived'] = 0 + + # DPT 2 Deprivation + df['dpt2deprived'] = 0 # Initialize column + df.loc[age_filter & ((df['h10'] == 0) | (df['h5'] == 0)), 'dpt2deprived'] = 1 + df.loc[age_filter & (df['h5'].between(1, 3)), 'dpt2deprived'] = 0 + + # DPT 3 Deprivation + df['dpt3deprived'] = 0 # Initialize column + df.loc[age_filter & ((df['h10'] == 0) | (df['h7'] == 0)), 'dpt3deprived'] = 1 + df.loc[age_filter & (df['h7'].between(1, 3)), 'dpt3deprived'] = 0 + + # Measles Deprivation + df['measlesdeprived'] = 0 # Initialize column + df.loc[age_filter & ((df['h10'] == 0) | (df['h9'] == 0)), 'measlesdeprived'] = 1 + df.loc[age_filter & (df['h9'].between(1, 3)), 'measlesdeprived'] = 0 + + # Before aggregating, reorder columns to send new variables to the end + column_order = [col for col in df.columns if col not in ['dpt1deprived', 'dpt2deprived', 'dpt3deprived', 'measlesdeprived']] + \ + ['dpt1deprived', 'dpt2deprived', 'dpt3deprived', 'measlesdeprived'] + df = df[column_order] + + # Count missing values across the immunization indicators + df['hasmissvaccines'] = df[['dpt1deprived', 'dpt2deprived', 'dpt3deprived', 'measlesdeprived']].isnull().sum(axis=1) + + # Sum up the indicators to get total vaccines missed + df['sumvaccines'] = df[['dpt1deprived', 'dpt2deprived', 'dpt3deprived', 'measlesdeprived']].sum(axis=1) + + # Adjust for rows where any vaccines data is missing + df.loc[df['hasmissvaccines'].between(1, 4), 'sumvaccines'] = pd.NA + + # Generate moderate deprivation based on missing any of the four immunizations + df['moderatevaccinesdeprived'] = 0 # Initialize with 0 + df.loc[df['sumvaccines'].between(1, 4), 'moderatevaccinesdeprived'] = 1 + + # Generate severe deprivation if all four vaccines are missing + df['severevaccinesdeprived'] = 0 # Initialize with 0 + 
df.loc[df['sumvaccines'] == 4, 'severevaccinesdeprived'] = 1 + + # Identifying ARI symptoms in children aged 36 to 59 months + df['arisymptoms'] = 0 # Initialize the column + # Set arisymptoms to 1 based on UNICEF's definition of ARI + df.loc[(df['h31'] == 2) & (df['h31b'] == 1) & (df['h31c'].isin([1, 3])) & (df['b19'].between(36, 59)), 'arisymptoms'] = 1 + + # Severe threshold: Child had ARI symptoms and no treatment was sought + df['ariseverelydeprived'] = 0 # Initialize the column + df.loc[(df['arisymptoms'] == 1) & (df['h32y'].isin([1, 8, pd.NA])), 'ariseverelydeprived'] = 1 + df.loc[(df['arisymptoms'] == 1) & (df['h32y'] == 0), 'ariseverelydeprived'] = 0 + + # Moderate (+severe) threshold: Child had ARI symptoms, and no treatment was sought at an appropriate medical facility + df['arimoderatedeprived'] = 0 # Initialize the column + df.loc[(df['arisymptoms'] == 1) & (df['h32z'] == 0), 'arimoderatedeprived'] = 1 + df.loc[(df['arisymptoms'] == 1) & (df['h32z'] == 1), 'arimoderatedeprived'] = 0 + + # Treatment at inappropriate facilities could be handled as below if specifics were provided: + # Assuming h32z values for inappropriate facilities are explicitly defined or derived elsewhere in your data context + # Define inappropriate treatment facilities (example values) + inappropriate_facilities = [5110, 9995, 2300, 9998] + df.loc[(df['ariseverelydeprived'] == 0) & df['h32z'].isin(inappropriate_facilities), 'arimoderatedeprived'] = 1 + + # Filter for girls aged 15 to 17 who have unmet needs for family planning + age_filter = (df['hv105'] >= 15) & (df['hv105'] <= 17) + need_filter = (df['v626a'] >= 1) & (df['v626a'] <= 4) # Assuming these codes indicate the need for family planning + + # Severe deprivation: Girls who do not want to become pregnant but are not using contraception + df['contramethodseverelydep'] = 0 # Initialize column + df.loc[age_filter & need_filter & (df['v312'] == 0), 'contramethodseverelydep'] = 1 + df.loc[age_filter & need_filter & (df['v312'] 
== 99), 'contramethodseverelydep'] = pd.NA # Handling missing data as NaN + + # Moderate (+ severe) deprivation: Includes girls using traditional methods of contraception + df['contramethodmoderatedep'] = 0 # Initialize column + traditional_methods = [8, 9, 10] # Assuming these codes indicate traditional methods + df.loc[age_filter & need_filter & (df['v312'].isin([0] + traditional_methods)), 'contramethodmoderatedep'] = 1 + df.loc[age_filter & need_filter & (df['v312'] == 99), 'contramethodmoderatedep'] = pd.NA # Handling missing data as NaN + + # Reorder columns to place certain deprivation indicators at the end (optional, mostly for visualization) + df = df[['severevaccinesdeprived', 'contramethodseverelydep', 'ariseverelydeprived'] + [col for col in df.columns if col not in ['severevaccinesdeprived', 'contramethodseverelydep', 'ariseverelydeprived']]] + + # Calculate the number of missing indicators for severe health deprivation + df['hasmissseverehealth'] = df[['severevaccinesdeprived', 'contramethodseverelydep', 'ariseverelydeprived']].isnull().sum(axis=1) + + # Aggregate severe health deprivation indicators + df['sumseverehealth'] = df[['severevaccinesdeprived', 'contramethodseverelydep', 'ariseverelydeprived']].sum(axis=1, min_count=1) # min_count=1 ensures NaN if all are NaN + + # Exclude children missing in all severe health indicators + df.loc[df['hasmissseverehealth'] == 3, 'sumseverehealth'] = pd.NA # Set to NaN if all indicators are missing + + # Generate the severe health deprivation index + df['severehealthdep'] = 0 # Default to 0 (not deprived) + df.loc[df['sumseverehealth'] == 1, 'severehealthdep'] = 1 # Set to 1 if deprived in one or more indicators + + # Reorder columns for easier handling (optional) + df = df[['moderatevaccinesdeprived', 'contramethodmoderatedep', 'arimoderatedeprived'] + [col for col in df.columns if col not in ['moderatevaccinesdeprived', 'contramethodmoderatedep', 'arimoderatedeprived']]] + + # Calculate the number of 
missing indicators for moderate health deprivation + df['hasmissmoderatehealth'] = df[['moderatevaccinesdeprived', 'contramethodmoderatedep', 'arimoderatedeprived']].isnull().sum(axis=1) + + # Aggregate moderate health deprivation indicators + df['summoderatehealth'] = df[['moderatevaccinesdeprived', 'contramethodmoderatedep', 'arimoderatedeprived']].sum(axis=1, min_count=1) # min_count=1 ensures NaN if all are NaN + + # Exclude children missing in both indicators + df.loc[df['hasmissmoderatehealth'] == 3, 'summoderatehealth'] = pd.NA # Set to NaN if all indicators are missing + + # Generate the moderate health deprivation index + df['moderatehealthdep'] = 0 # Default to 0 (not deprived) + df.loc[df['summoderatehealth'] == 1, 'moderatehealthdep'] = 1 # Set to 1 if deprived in one or more indicators + + ## calculate education deprivation + + # Filter for the young cohort (5 to 14 years old) + age_filter = (df['hv105'] >= 5) & (df['hv105'] <= 14) + + # Severe Educational Deprivation + # Initial severe deprivation setup based on not attending school and no schooling level reached + df['severeedudeprivedbelow15'] = 0 + df.loc[age_filter & (df['hv109'] == 0) & (df['hv121'] == 0), 'severeedudeprivedbelow15'] = 1 + df.loc[age_filter & (df['hv109'] == 0) & (df['hv121'] == 2), 'severeedudeprivedbelow15'] = 0 + df.loc[age_filter & (df['hv109'] >= 1) & (df['hv109'] <= 5), 'severeedudeprivedbelow15'] = 0 + + # Moderate Educational Deprivation + # Starts with severe deprivation setup + df['moderateedudeprivedbelow15'] = df['severeedudeprivedbelow15'] + df.loc[age_filter & (df['hv121'] == 0), 'moderateedudeprivedbelow15'] = 1 + df.loc[age_filter & (df['hv121'] == 0) & (df['hv109'].between(4, 5)), 'moderateedudeprivedbelow15'] = 0 + df.loc[age_filter & (df['hv109'] == 0) & (df['hv122'] == 1), 'moderateedudeprivedbelow15'] = 1 + df.loc[age_filter & (df['hv109'] == 0) & ((df['hv121'] == 98) | df['hv121'].isna() | (df['hv122'] == 8)), 'moderateedudeprivedbelow15'] = 1 + + # Handling 
missing cases - setting to NaN where there's insufficient data to determine deprivation + missing_conditions = (df['hv109'].isna() | df['hv109'].isin([7, 8])) & df['hv121'].isna() + df.loc[missing_conditions, 'moderateedudeprivedbelow15'] = pd.NA + + ## older cohort + # Filter for the older cohort (15 to 17 years old) + older_cohort_filter = (df['hv105'] >= 15) & (df['hv105'] <= 17) + + # Severe Educational Deprivation for older cohort + df['severeedudeprived15older'] = 0 # Initialize with 0 + df.loc[older_cohort_filter & (df['hv121'] == 2), 'severeedudeprived15older'] = 0 + df.loc[older_cohort_filter & (df['hv109'] == 0) & (df['hv121'] == 0), 'severeedudeprived15older'] = 1 + df.loc[older_cohort_filter & (df['hv121'] == 0) & (df['hv109'] <= 1), 'severeedudeprived15older'] = 1 + df.loc[older_cohort_filter & (df['hv121'] == 0) & (df['hv109'] >= 2), 'severeedudeprived15older'] = 0 + df.loc[df['hv109'].isna() | (df['hv109'] == 8), 'severeedudeprived15older'] = pd.NA + + # Moderate Educational Deprivation for older cohort + df['moderateedudeprived15older'] = df['severeedudeprived15older'].copy() + df.loc[older_cohort_filter & (df['hv121'] == 2) & (df['hv122'] <= 1), 'moderateedudeprived15older'] = 1 + df.loc[older_cohort_filter & (df['hv121'] == 0) & (df['hv109'] < 4), 'moderateedudeprived15older'] = 1 + df.loc[older_cohort_filter & (df['hv121'] == 0) & (df['hv109'] >= 4), 'moderateedudeprived15older'] = 0 + df.loc[df['hv109'].isna() | (df['hv109'] == 8), 'moderateedudeprived15older'] = pd.NA + + # Aggregate severe deprivation across both age groups + df['severeedudeprivedgroup'] = 0 # Initialize with 0 + df.loc[(df['severeedudeprivedbelow15'] == 1) | (df['severeedudeprived15older'] == 1), 'severeedudeprivedgroup'] = 1 + + # Aggregate moderate deprivation across both age groups + df['moderateedudeprivedgroup'] = 0 # Initialize with 0 + df.loc[(df['severeedudeprivedgroup'] == 1) | (df['moderateedudeprivedbelow15'] == 1) | (df['moderateedudeprived15older'] == 1), 
'moderateedudeprivedgroup'] = 1 + + ## final agg + + # Identifying missing data in moderate deprivation indicators + moderate_columns = [col for col in df.columns if 'dep_' in col and '_mod' in col] + df['hasmissmoderatepoor'] = df[moderate_columns].isnull().sum(axis=1) + + # Aggregating moderate deprivation indicators + df['summoderatepoor'] = df[moderate_columns].sum(axis=1, min_count=1) # Use min_count=1 to require at least one non-NA value + + # Discounting children missing in all moderate dimensions + df.loc[df['hasmissmoderatepoor'] == 6, 'summoderatepoor'] = pd.NA + + # Determining final incidence of moderate child poverty + df['moderatelydeprived'] = 0 # Default to not deprived + df.loc[df['summoderatepoor'] >= 1, 'moderatelydeprived'] = 1 + + # Identifying missing data in severe deprivation indicators + severe_columns = [col for col in df.columns if 'dep_' in col and '_sev' in col] + df['hasmissseverepoor'] = df[severe_columns].isnull().sum(axis=1) + + # Aggregating severe deprivation indicators + df['sumseverepoor'] = df[severe_columns].sum(axis=1, min_count=1) + + # Discounting children missing in all severe dimensions + df.loc[df['hasmissseverepoor'] == 6, 'sumseverepoor'] = pd.NA + + # Determining final incidence of severe child poverty + df['severelydeprived'] = 0 # Default to not deprived + df.loc[df['sumseverepoor'] >= 1, 'severelydeprived'] = 1 + + ## clean up of dataframe + + # Summarize hv007 to get the range + df['year_interview'] = df['hv007'] + year2_min = df['hv007'].min() + year2_max = df['hv007'].max() + df['year_interview_range'] = f"{year2_min}-{year2_max}" + + # Rename variables to make them more intuitive + df = df.rename(columns={ + "hv001" : "cluster", + "hv002" : "hhid", + "hvidx" : "indid", + 'hv025': 'location', + 'hv104': 'sex', + 'hv270': 'wealth', + 'hv271': 'wealthscore', + 'hv024': 'region', + 'hv105': 'age' + }) + + # Housing + df.rename(columns={'severecrowdingdep': 'dep_housing_sev', 'moderatecrowdingdep': 
'dep_housing_mod'}, inplace=True, errors='ignore') + + # Water + df.rename(columns={'severewaterdep': 'dep_water_sev', 'moderatewaterdep': 'dep_water_mod'}, inplace=True, errors='ignore') + + # Sanitation + df.rename(columns={'severesanitationdeprived': 'dep_sanitation_sev', 'moderatesanitationdeprived': 'dep_sanitation_mod'}, inplace=True, errors='ignore') + + # Nutrition + df.rename(columns={'severestunting': 'dep_nutrition_sev', 'moderatestunting': 'dep_nutrition_mod'}, inplace=True, errors='ignore') + + # Initialize new columns for nutrition HAZ scores + df['nutrition_haz'] = pd.NA + df['nutrition_hazflag'] = pd.NA + + # Check if 'hc70' is in the DataFrame and use it to replace values in 'nutrition_haz' + if 'hc70' in df.columns: + df['nutrition_haz'] = df['hc70'] # Update nutrition_haz only if hc70 is within the specified range + df.loc[~df['hc70'].between(-300, 900), 'nutrition_haz'] = pd.NA # Assume the range condition needs to be applied + + # Renaming HAZ and HAZFLAG if they exist + df.rename(columns={'HAZ': 'haz', 'HAZFLAG': 'hazflag'}, inplace=True, errors='ignore') + + # If 'haz' is in the DataFrame, update 'nutrition_haz' and optionally 'nutrition_hazflag' + if 'haz' in df.columns: + df['nutrition_haz'] = df['haz'] + if 'hazflag' in df.columns: + df['nutrition_hazflag'] = df['hazflag'] + + # Health Related Renaming and Variable Generation + df.rename(columns={ + 'severehealthdep': 'dep_health_sev', + 'moderatehealthdep': 'dep_health_mod', + 'severevaccinesdeprived': 'health_vac_sevdep', + 'moderatevaccinesdeprived': 'health_vac_moddep', + 'ariseverelydeprived': 'health_ari_sevdep', + 'arimoderatedeprived': 'health_ari_moddep', + 'contramethodseverelydep': 'health_con_sevdep', + 'contramethodmoderatedep': 'health_con_moddep' + }, inplace=True, errors='ignore') + + # Initialize new health-related columns + df['health_polio'] = pd.NA + df['health_measles'] = pd.NA + + # Update measles based on measlesdeprived, if it exists + if 'measlesdeprived' in 
df.columns: + df['health_measles'] = df['measlesdeprived'].apply(lambda x: 0 if x == 1 else 1 if x == 0 else pd.NA) + + # Generate and update DPT related columns + for i in range(1, 4): + col_name = f'health_dpt{i}' + deprived_col = f'dpt{i}deprived' + df[col_name] = pd.NA + if deprived_col in df.columns: + df[col_name] = df[deprived_col].apply(lambda x: 0 if x == 1 else 1 if x == 0 else pd.NA) + + # Education Related Renaming + df.rename(columns={ + 'severeedudeprivedgroup': 'dep_education_sev', + 'moderateedudeprivedgroup': 'dep_education_mod', + 'severeedudeprivedbelow15': 'education_b15_sevdep', + 'moderateedudeprivedbelow15': 'education_b15_moddep', + 'severeedudeprived15older': 'education_15o_sevdep', + 'moderateedudeprived15older': 'education_15o_moddep' + }, inplace=True, errors='ignore') + + # Renaming columns for summary indicators of severe and moderate poverty + df.rename(columns={ + 'sumseverepoor': 'sumpoor_sev', + 'summoderatepoor': 'sumpoor_mod' + }, inplace=True, errors='ignore') + + # Rename severely and moderately deprived indicators + df.rename(columns={ + 'severelydeprived': 'deprived_sev', + 'moderatelydeprived': 'deprived_mod' + }, inplace=True, errors='ignore') + + # Check if 'deprived_sev' exists in the DataFrame and create or update if not + if 'deprived_sev' not in df.columns: + # Create the 'deprived_sev' based on 'sumpoor_sev' if it exists + if 'sumpoor_sev' in df.columns: + df['deprived_sev'] = (df['sumpoor_sev'] >= 1).astype(int) + else: + # If 'sumpoor_sev' also doesn't exist, you might need to calculate it based on a pattern + # Assuming dep_*_sev pattern for deprivation columns + sev_columns = [col for col in df.columns if 'dep_' in col and '_sev' in col] + df['sumpoor_sev'] = df[sev_columns].sum(axis=1, min_count=1) # min_count=1 to require at least one non-NA value + df['deprived_sev'] = (df['sumpoor_sev'] >= 1).astype(int) + + # Similar check and creation for 'deprived_mod' + if 'deprived_mod' not in df.columns: + if 
'sumpoor_mod' in df.columns:
            # 'sumpoor_mod' already exists: moderately deprived means deprived
            # in at least one moderate dimension
            df['deprived_mod'] = (df['sumpoor_mod'] >= 1).astype(int)
        else:
            # Calculate 'sumpoor_mod' if not already present, based on dep_*_mod pattern
            mod_columns = [col for col in df.columns if 'dep_' in col and '_mod' in col]
            # min_count=1 keeps the sum as NA when every indicator is missing
            df['sumpoor_mod'] = df[mod_columns].sum(axis=1, min_count=1)
            df['deprived_mod'] = (df['sumpoor_mod'] >= 1).astype(int)

    # Keep only relevant variables: identifiers, weights, demographics and the
    # deprivation / poverty summary columns built above
    columns_to_keep = [
        col for col in df.columns if (
            'countrycode' in col or 'year' in col or 'survey' in col or 'version' in col or
            'round' in col or 'cluster' in col or 'hhid' in col or 'indid' in col or
            'chweight' in col or 'hhweight' in col or 'location' in col or 'sex' in col or
            'wealth' in col or 'region' in col or 'age' in col or 'orphaned' in col or
            col.startswith('dep_') or col.startswith('education_') or
            col.startswith('health_') or col.startswith('nutrition_') or
            'sumpoor_' in col or 'deprived_' in col
        )
    ]
    df = df[columns_to_keep]

    # reset index
    df = df.reset_index(drop=True)

    # Sort and order DataFrame
    df.sort_values(by=['cluster', 'hhid', 'indid'], inplace=True)

    # Optionally compress the DataFrame before saving
    df = df.convert_dtypes()

    # Optionally save DataFrame
    file_name = f"poverty_variables.csv"
    if save_csv:
        df.to_csv(os.path.join(source_path,file_name), index=False)
    return df


def make_string(integer: int, length: int = 8) -> str:
    """Zero-pad *integer* into a string of at least *length* characters.

    Used to build fixed-width cluster ids (e.g. 90 -> '00000090') that are
    concatenated with country code and year further below.
    """
    return str(integer).zfill(length)


def check_file_integrity(parent_dir: str, all_files, country_code: dict) -> bool:
    """Check that every expected survey appears somewhere under *parent_dir*.

    Parameters:
        parent_dir (str): directory whose entry names are matched against *all_files*.
        all_files (list of str): expected survey tokens; the f[:2] / f[-4:]
            slicing below implies the form '<CC>_<YYYY>' (two-letter country
            code then year) -- TODO confirm against the config's 'all_DHS' list.
        country_code (dict): maps two-letter DHS country codes to country names
            (used only to print a readable message).

    Returns:
        bool: True when every expected token is a substring of some entry in
        *parent_dir*; False otherwise (one message printed per missing survey).
    """
    complete = True
    for f in all_files:
        # substring match, so 'ET_2018' matches a folder like 'ET_2018_DHS_...'
        if not any(f in string for string in os.listdir(parent_dir)):
            print(f'{country_code[f[:2]]}\'s data in year {f[-4:]} is missing.')
            complete = False
    return complete


def process_dhs(parent_dir: str, config_file: str) -> None:
    """
    Uses the get_poverty() function to get DHS data and deprivation indicators for each country/year surveyed.
    Then all the dhs/poverty/gps data is combined into 3 dataframes which are aggregated by cluster
    This is split into train and test folds, and saved as csvs.

    Parameters:
    parent_dir (str): The path to the directory containing the DHS data files.
    config_file (str): The path to the JSON file containing the config information.

    Returns:
    None
    """
    # load config json files, check all data is in folder
    if parent_dir[-1]!='/':
        parent_dir+=r'/'
    with open(config_file, 'r') as file:
        config_data = json.load(file)
    # NOTE(review): par_dir is not defined in this function -- presumably a
    # module-level path constant declared near the top of the file; confirm.
    with open(f'{par_dir}dhs_country_code.json', 'r') as file:
        dhs_cc = json.load(file)
    save_to_csv_dir = os.path.join(parent_dir, "dhs_variables.csv")

    print('Checking file integrity...')
    if not check_file_integrity(parent_dir, config_data['all_DHS'], dhs_cc):
        raise FileNotFoundError('DHS data incomplete')

    # create all dhs_variables and poverty_variables datasets
    pov_dfs = []
    print('Summarizing poverty...')
    for f in tqdm(os.listdir(parent_dir)):
        if 'DHS' in f:
            pov_df = get_poverty(parent_dir+f+'/', save_csv=True) # True
            # if returned false then we ignore
            # NOTE(review): isinstance(pov_df, pd.DataFrame) would be the
            # idiomatic check here
            if type(pov_df) == pd.DataFrame:
                pov_dfs.append(pov_df)

    dhs_dfs = []
    print('Extracting DHS variables...')
    for f in tqdm(os.listdir(parent_dir)):
        if 'DHS' in f:
            # dhs_variables.csv is expected in each survey folder (written by
            # an earlier step; not visible in this chunk)
            dhs_dfs.append(pd.read_csv(os.path.join(parent_dir,f,'dhs_variables.csv')))

    # group dhs data by cluster and preprocess based on config from JSON file
    thresholds = config_data['dhs_variable_lim']
    columns_to_encode = config_data['categorical_columns']
    matches = config_data['matches']

    dhs_dfs_agg = []
    print('Aggregating DHS variables...')
    for df in tqdm(dhs_dfs):
        ccode = df.loc[0, 'countrycode']
        year = str(df.loc[0, "year"])
        # drop rows exceeding the configured per-variable limits (NA rows kept)
        for column, threshold in thresholds.items():
            if column in df.columns:
                df = df[(df[column] <= threshold) | (df[column].isna())]

        filtered_columns_to_encode = [col for col in columns_to_encode if col in df.columns]
        # cast categoricals to nullable Int64 before one-hot encoding so NA
        # values survive the conversion
        for col in df.columns:
            if col in filtered_columns_to_encode:
                df[col] = df[col].astype('Int64')

        df = pd.get_dummies(df, columns=filtered_columns_to_encode)
        # hv001 is the cluster number: aggregate every numeric/bool column to
        # its cluster mean
        df_agg = df.select_dtypes(include=[np.number, bool]).groupby('hv001').agg('mean').reset_index()
        # id = countrycode + year + zero-padded cluster number; appears to
        # mirror the DHSID/CENTROID_ID format used for the merge below -- confirm
        df_agg['id'] = ccode + year + df_agg['hv001'].apply(make_string)

        dhs_dfs_agg.append(df_agg)
    dhs_df_all = pd.concat(dhs_dfs_agg)

    existing_cols = [col for col in matches if col in dhs_df_all.columns]

    # one-hot encoding renamed some columns to '<name>_<value>'; pick those up
    # for any 'matches' entry that no longer exists verbatim
    additional_cols = []
    for col in matches:
        if col not in dhs_df_all.columns:
            pattern_cols = [c for c in dhs_df_all.columns if c.startswith(f"{col}_")]
            additional_cols.extend(pattern_cols)
    cols_to_select = existing_cols + additional_cols + ['id']
    # NOTE(review): set() removes duplicates but also randomizes column order
    dhs_df_all = dhs_df_all[list(set(cols_to_select))]

    # group poverty data by cluster
    print('Aggregating poverty data...')
    poverty_dfs_agg = []
    for df in tqdm(pov_dfs):
        ccode = df.loc[0, 'countrycode']
        year = str(df.loc[0, "year"])
        df_agg = df.select_dtypes(include=[np.number]).groupby('cluster').agg('mean').reset_index()
        df_agg['id'] = ccode + year + df_agg['cluster'].apply(make_string)
        poverty_dfs_agg.append(df_agg)
    pov_df_all = pd.concat(poverty_dfs_agg)

    ## Calculating Centroids
    gdfs = []
    for f in os.listdir(parent_dir):
        if 'DHS' in f:
            for sub_f in os.listdir(os.path.join(parent_dir,f)):
                # 'GE' subfolders hold the DHS GPS shapefiles
                # NOTE(review): `'GE' in sub_f` is the idiomatic spelling
                if sub_f.__contains__('GE'):
                    shape_file = os.path.join(parent_dir, f, sub_f)
                    gdf = gpd.read_file(shape_file)
                    # Append to the list of GeoDataFrames.
                    gdfs.append(gdf)
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))


    def country_code_to_name(country_code):
        # translate a two-letter DHS country code via the loaded JSON mapping
        return dhs_cc[country_code]


    combined_gdf['COUNTRY'] = combined_gdf['DHSCC'].apply(country_code_to_name)
    combined_gdf['SURVEY_NAME'] = [combined_gdf.iloc[i]['COUNTRY']+'_DHS_'+str(int(combined_gdf.iloc[i]['DHSYEAR'])) for i in range(combined_gdf.shape[0])]
    combined_gdf['YEAR'] =combined_gdf['DHSYEAR'].apply(int)
    combined_gdf['CENTROID_ID'] = combined_gdf['DHSID']

    centroid_df = combined_gdf[['CENTROID_ID', 'SURVEY_NAME', 'COUNTRY','YEAR', 'LATNUM', 'LONGNUM']]
    # (0, 0) coordinates mark clusters with no valid GPS fix; drop them
    centroid_df = centroid_df[~((centroid_df['LATNUM'] == 0) & (centroid_df['LONGNUM'] == 0))]
    centroid_df.drop_duplicates(inplace=True)
    pov_df_all.drop_duplicates(inplace=True)
    dhs_df_all.drop_duplicates(inplace=True)
    centroid_df = centroid_df.reset_index(drop=True)

    # merge dhs, poverty data and GPS data
    merged_centroid_df = pd.merge(centroid_df, pov_df_all, left_on='CENTROID_ID', right_on='id', how='left')
    merged_centroid_df = pd.merge(merged_centroid_df, dhs_df_all, left_on='CENTROID_ID', right_on='id', how='left')

    # remove some cols after join
    merged_centroid_df = merged_centroid_df.drop(["hhid", "indid", "id_x", "id_y", "year_interview"], axis=1)

    # remove all over 16s
    # merged_centroid_df = merged_centroid_df[merged_centroid_df["age"] < 16]

    # only want to scale certain columns
    no_scale_cols = ["CENTROID_ID", "SURVEY_NAME", "COUNTRY", "YEAR",
                     "LATNUM", "LONGNUM", "cluster"]
    df_subset = merged_centroid_df.drop(no_scale_cols, axis=1)
    # Remove columns if all values are NaN
    df_subset = df_subset.dropna(axis=1, how='all')

    # Dictionary to store min-max values
    min_max_dict = {}

    # Function to scale columns and record min-max values
    def scale_column(col):
        # only numeric columns that are not already within [0, 1] are rescaled;
        # everything else is returned untouched and not recorded in min_max_dict
        if is_numeric_dtype(col):
            min_val = col.min()
            max_val = col.max()
            if (min_val < 0) or (max_val > 1):
                # MinMaxScaler rescales to its default [0, 1] feature range
                scaler = MinMaxScaler()
                scaled_col = scaler.fit_transform(col.values.reshape(-1, 1)).flatten()
                min_max_dict[col.name] = {'min': min_val, 'max': max_val}
                return scaled_col
        return col

    # Apply scaling to appropriate columns
    df_scaled = df_subset.apply(scale_column)

    # Combine the original first 7 columns with the processed subset
    df_processed = pd.concat([merged_centroid_df.loc[:, no_scale_cols], df_scaled], axis=1)
    # Save min-max dictionary locally (written to the current working directory)
    with open('min_max_values.json', 'w') as f:
        json.dump(min_max_dict, f, indent=4)

    # appears to target one-hot-encoded columns like 'hv025_1.0' (letters,
    # optional digits, underscore, then a non-letter) -- confirm intent
    # NOTE(review): col_pattern already starts with '^', so the extra f"^{...}"
    # anchor is redundant (harmless)
    col_pattern = r"^[a-zA-Z]*\d*_[^a-zA-Z]"
    matching_columns = [col for col in df_processed.columns if re.match(f"^{col_pattern}", col)]

    # Fill NaN values with 0 in the matched columns
    df_processed[matching_columns] = df_processed[matching_columns].fillna(0)

    df_processed.to_csv(save_to_csv_dir, index=False)

    save_split(df_processed)


def save_split(df) -> None:
    """Shuffle *df*, write 5-fold train/test CSVs plus a before/after-2020 split.

    Outputs (all under save_par_dir): dhs_variables.csv, train_fold_{1..5}.csv,
    test_fold_{1..5}.csv, before_2020.csv, after_2020.csv.

    NOTE(review): save_par_dir is a hard-coded, user-specific absolute path --
    this will only work on the original author's machine; should be made
    configurable.
    """
    save_par_dir = r'C:/Users/jgidn/Documents/Summer Project/KidSat/survey_processing/processed_data/'
    df.to_csv(f'{save_par_dir}dhs_variables.csv', index=False)
    # fixed random_state keeps the shuffle and fold assignment reproducible
    df = df.sample(frac=1, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold = 1
    for train_index, test_index in kf.split(df):
        # Generate train and test subsets
        train_df = df.iloc[train_index]
        test_df = df.iloc[test_index]

        # Save to CSV files
        train_df.to_csv(f'{save_par_dir}train_fold_{fold}.csv', index=False)
        test_df.to_csv(f'{save_par_dir}test_fold_{fold}.csv', index=False)

        fold += 1
    # temporal split used for out-of-time evaluation
    old_df = df[df['YEAR'] < 2020]
    new_df = df[df['YEAR'] >= 2020]
    new_df.to_csv(f'{save_par_dir}after_2020.csv', index=False)
    old_df.to_csv(f'{save_par_dir}before_2020.csv', index=False)


def main() -> None:
    """CLI entry point: parse arguments and run the DHS processing pipeline."""
    # Setup argument parser
    parser = argparse.ArgumentParser(description="Process DHS data to a single CSV file.")
    parser.add_argument("parent_dir", help="The parent directory enclosing all DHS folders")
    # NOTE(review): default relies on the module-level par_dir constant -- confirm
    parser.add_argument("config_file", nargs='?', default=f'{par_dir}processing_params.json',
                        help="The configuration parameters for preprocessing (default: processing_params.json)")
    # Parse arguments
    args = parser.parse_args()

    # Call the download function with the parsed arguments
    process_dhs(args.parent_dir, args.config_file)


if __name__ == "__main__":
    main()
\ No newline at end of file
diff --git a/survey_processing/processing_params.json b/survey_processing/processing_params.json
index 30c848d..81685ef 100644
--- a/survey_processing/processing_params.json
+++ b/survey_processing/processing_params.json
@@ -16,7 +16,9 @@
         "hv216": 24,
         "hv225": 2,
         "hv271": 1e5,
-        "v312": 20
+        "v312": 20,
+        "hv111": 2,
+        "hv113": 2
     },
     "categorical_columns": [
         "h10",
@@ -68,7 +70,9 @@
         "v003",
         "v005",
         "v007",
-        "v312"
+        "v312",
+        "hv111",
+        "hv113"
     ],
     "all_DHS": [
         "ZM_2013",