Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions src/soep_preparation/clean_modules/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Clean and convert SOEP health variables to appropriate data types."""

import pandas as pd

from soep_preparation.utilities.data_manipulator import (
apply_smallest_int_dtype,
create_dummy,
float_to_float,
)


def clean(raw_data: pd.DataFrame) -> pd.DataFrame:
    """Build the cleaned health-module variables from the raw SOEP data.

    The health module is a generated person-year dataset whose health
    indicators have been collected in a two-year replication cycle since
    2002. Its core is the SOEP version of the SF-12v2. All SF-12 scales are
    standardized to mean 50 and standard deviation 10 in the SOEP 2004
    population; higher values represent better health-related quality of
    life.

    Abbreviations used in the column names:
        sf12: Variable derived from the SF-12v2 instrument.
        pcs: Physical component summary scale.
        mcs: Mental component summary scale.
        nbs: One of the eight norm-based subscales that underlie the two
            summary scales.

    Args:
        raw_data: The raw health data.

    Returns:
        The processed health data.
    """
    # Cleaned column name -> raw column name. Dict order fixes the column
    # order of the result, so identifiers come first.
    identifier_sources = {
        "p_id": "pid",
        "hh_id_original": "cid",
        "survey_year": "syear",
    }

    # The two component summary scales followed by the eight norm-based
    # subscales they are built from.
    sf12_score_sources = {
        "sf12_pcs": "pcs",
        "sf12_mcs": "mcs",
        "sf12_physical_functioning_nbs": "pf_nbs",
        "sf12_role_physical_nbs": "rp_nbs",
        "sf12_bodily_pain_nbs": "bp_nbs",
        "sf12_general_health_nbs": "gh_nbs",
        "sf12_vitality_nbs": "vt_nbs",
        "sf12_social_functioning_nbs": "sf_nbs",
        "sf12_role_emotional_nbs": "re_nbs",
        "sf12_mental_health_nbs": "mh_nbs",
    }

    cleaned = pd.DataFrame()

    for target, source in identifier_sources.items():
        cleaned[target] = apply_smallest_int_dtype(raw_data[source])

    # The SF-12 scores arrive as floats with SOEP missing codes encoded as
    # negative values, so those are recoded to NA during conversion.
    for target, source in sf12_score_sources.items():
        cleaned[target] = float_to_float(
            raw_data[source],
            code_negative_values_as_na=True,
        )

    # Whether all twelve items needed for the SF-12 scoring are complete.
    cleaned["sf12_valid"] = create_dummy(
        series=raw_data["valid"],
        value_for_comparison="[1] Yes",
        comparison_type="equal",
    )

    # BMI uses the same float encoding (negative values are missing codes).
    cleaned["bmi_health"] = float_to_float(
        raw_data["bmi"],
        code_negative_values_as_na=True,
    )

    return cleaned
Loading
Loading