issue_21_ultrasound_form_delivery_date.py

Debbie Bitran, 21/06/2024 04:46

Download (3.72 KB)

 
1
# -*- coding: utf-8 -*-
2
"""issue_21_ultrasound_form_delivery_date.ipynb
3

4
Automatically generated by Colab.
5

6
Original file is located at
7
    https://colab.research.google.com/drive/1o_D5W6ARSNXdY-9G3g36h6B_R16cWs6g
8

9
## 1. Required dependencies
10

11
### ***please load the packages and update the constants/variables in the cell below. after that you can run the remainder of the script to reproduce the steps I took to audit the issue. thanks!***
12
"""
13

    
14
import pandas as pd
15
import numpy as np
16

    
17
# constant values needed to run script
18
# please update the values below
19

    
20
# export csv from: EDC > Flourish Caregiver > Birth Form
21
PATH_TO_MATERNAL_DELIVERY = '/MaternalDelivery-2024-06-20.csv'
22

    
23
# export csv from: EDC > Flourish Caregiver > Ultrasound Form
24
PATH_TO_ULTRASOUND = '/UltraSound-2024-06-14.csv'
25

    
26
# quick function to check df dimensions against n unique chilpid's to verify no duplicates
27
def print_df_dimensions(df):
28
    print(f"""checking dimensions:
29
          dimensions: {df.shape}
30
          nunique childpid: {df["childpid"].nunique()}
31
          ---------
32
          """)
33

    
34
# You can now run the rest of the script!
35

    
36
"""## 2. Run the script below to reproduce the data anomalies highlighted in this issue
37

38
### Loading data
39
"""
40

    
41
# loading maternal delivery data
42
maternal_delivery_df = pd.read_csv(PATH_TO_MATERNAL_DELIVERY)
43

    
44
# verifying no duplicates
45
print_df_dimensions(maternal_delivery_df)
46

    
47
# selecting columns relevant to issue
48
maternal_delivery_df = maternal_delivery_df.filter(regex = "childpid|matpid|datetime")
49

    
50
# printing columns to confirm appropriate variables selected
51
print("maternal_delivery_df columns selected:\n\t", maternal_delivery_df.columns.tolist())
52
# renaming columns with suffix to identify CRF data was pulled from
53
maternal_delivery_df = maternal_delivery_df.rename(columns={"report_datetime":"report_datetime_matdeliverycrf", "delivery_datetime":"delivery_datetime_matdeliverycrf"})
54

    
55
print("\nnew column names for merging with ultrasound df:\n\t", maternal_delivery_df.columns.tolist())
56

    
57
# loading ultrasound form data
58
ultrasound_df = pd.read_csv(PATH_TO_ULTRASOUND)
59

    
60
# verifying no duplicates
61
print_df_dimensions(ultrasound_df)
62

    
63
# selecting columns relevant to issue
64
ultrasound_df = ultrasound_df.filter(regex = "childpid|matpid|datetime|delivery_date")
65

    
66
# printing columns to confirm appropriate variables selected
67
print("ultrasound_df columns selected:\n\t", ultrasound_df.columns.tolist())
68
# renaming columns with suffix to identify CRF data was pulled from
69
ultrasound_df = ultrasound_df.rename(columns={"report_datetime":"report_datetime_ultrasoundcrf", "maternal_delivery_date":"maternal_delivery_date_ultrasoundcrf"})
70

    
71
print("\nnew column names for merging with maternal delivery df:\n\t", ultrasound_df.columns.tolist())
72

    
73
"""### Merging `maternal_delivery_df` and `ultrasound_df` on `childpid` to compare values for delivery dates across the two CRFs"""
74

    
75
# merging maternal_delivery_df and ultrasound_df on childpid
76
merged_df = pd.merge(ultrasound_df, maternal_delivery_df, on=['childpid', 'matpid'], how='left')
77

    
78
# verifying merge
79
print_df_dimensions(merged_df)
80

    
81
"""### Creating `issue_df` with records relevant to current issue by filtering `merged_df` where:
82
- delivery date is not missing
83
- delivery date in `maternal_delivery_df` does not match/equal delivery date in `ultrasound_df`
84
"""
85

    
86
# filtering observations where delivery_datetime_matdeliverycrf does NOT equal maternal_delivery_date_ultrasoundcrf
87
issue_df = merged_df.copy()[~(merged_df['delivery_datetime_matdeliverycrf'] == merged_df['maternal_delivery_date_ultrasoundcrf'])]
88

    
89
# removing observations for subjects who haven't been born yet (i.e., where delivery date is a missing value)
90
issue_df = issue_df[issue_df['delivery_datetime_matdeliverycrf'].notna()]
91

    
92
# verifying merge
93
print_df_dimensions(issue_df)