diff --git a/Data_Analysis/ALD.ipynb b/Data_Preparation/ALD.ipynb similarity index 100% rename from Data_Analysis/ALD.ipynb rename to Data_Preparation/ALD.ipynb diff --git a/Data_Analysis/Data_Finess.ipynb b/Data_Preparation/Data_Finess.ipynb similarity index 100% rename from Data_Analysis/Data_Finess.ipynb rename to Data_Preparation/Data_Finess.ipynb diff --git a/Data_Analysis/Data_Professionnels.ipynb b/Data_Preparation/Data_Professionnels.ipynb similarity index 100% rename from Data_Analysis/Data_Professionnels.ipynb rename to Data_Preparation/Data_Professionnels.ipynb diff --git a/Data_Analysis/Prepa_ALD.ipynb b/Data_Preparation/Prepa_ALD.ipynb similarity index 100% rename from Data_Analysis/Prepa_ALD.ipynb rename to Data_Preparation/Prepa_ALD.ipynb diff --git a/Data_Analysis/Prepa_Acts.ipynb b/Data_Preparation/Prepa_Acts.ipynb similarity index 100% rename from Data_Analysis/Prepa_Acts.ipynb rename to Data_Preparation/Prepa_Acts.ipynb diff --git a/Data_Analysis/Prepa_Hospitalisations.ipynb b/Data_Preparation/Prepa_Hospitalisations.ipynb similarity index 100% rename from Data_Analysis/Prepa_Hospitalisations.ipynb rename to Data_Preparation/Prepa_Hospitalisations.ipynb diff --git a/Data_Analysis/Prepa_Medecins.ipynb b/Data_Preparation/Prepa_Medecins.ipynb similarity index 100% rename from Data_Analysis/Prepa_Medecins.ipynb rename to Data_Preparation/Prepa_Medecins.ipynb diff --git a/Data_Analysis/Prepa_Visits.ipynb b/Data_Preparation/Prepa_Visits.ipynb similarity index 100% rename from Data_Analysis/Prepa_Visits.ipynb rename to Data_Preparation/Prepa_Visits.ipynb diff --git a/Data_Analysis/Prepa_drugs.ipynb b/Data_Preparation/Prepa_drugs.ipynb similarity index 100% rename from Data_Analysis/Prepa_drugs.ipynb rename to Data_Preparation/Prepa_drugs.ipynb diff --git a/Data_Analysis/Prepa_pop.ipynb b/Data_Preparation/Prepa_pop.ipynb similarity index 100% rename from Data_Analysis/Prepa_pop.ipynb rename to Data_Preparation/Prepa_pop.ipynb diff --git a/Data_Analysis/Public_Data_Explo.ipynb b/Data_Preparation/Public_Data_Explo.ipynb similarity index 100% rename from Data_Analysis/Public_Data_Explo.ipynb rename to Data_Preparation/Public_Data_Explo.ipynb diff --git a/SNDSGenerator/apply_schemas_fixes.sh b/Data_Preparation/apply_schemas_fixes.sh similarity index 100% rename from SNDSGenerator/apply_schemas_fixes.sh rename to Data_Preparation/apply_schemas_fixes.sh diff --git a/SNDSGenerator/create_nomenclatures.py b/Data_Preparation/create_nomenclatures.py similarity index 100% rename from SNDSGenerator/create_nomenclatures.py rename to Data_Preparation/create_nomenclatures.py diff --git a/SNDSGenerator/prepare.py b/Data_Preparation/prepare.py similarity index 100% rename from SNDSGenerator/prepare.py rename to Data_Preparation/prepare.py diff --git a/AVCGenerator/2016_ald-prevalentes_serie-annuelle.xls b/Models/AVC/2016_ald-prevalentes_serie-annuelle.xls similarity index 100% rename from AVCGenerator/2016_ald-prevalentes_serie-annuelle.xls rename to Models/AVC/2016_ald-prevalentes_serie-annuelle.xls diff --git a/AVCGenerator/TauxAVC_geode.csv b/Models/AVC/TauxAVC_geode.csv similarity index 100% rename from AVCGenerator/TauxAVC_geode.csv rename to Models/AVC/TauxAVC_geode.csv diff --git a/AVCGenerator/TauxHospi_Geode_2019.csv b/Models/AVC/TauxHospi_Geode_2019.csv similarity index 100% rename from AVCGenerator/TauxHospi_Geode_2019.csv rename to Models/AVC/TauxHospi_Geode_2019.csv diff --git a/AVCGenerator/cip13_B01AC.pkl b/Models/AVC/cip13_B01AC.pkl similarity index 100% rename from AVCGenerator/cip13_B01AC.pkl rename to Models/AVC/cip13_B01AC.pkl diff --git a/AVCGenerator/create_eventtable.ipynb b/Models/AVC/create_eventtable.ipynb similarity index 100% rename from AVCGenerator/create_eventtable.ipynb rename to Models/AVC/create_eventtable.ipynb diff --git a/AVCGenerator/data_analysis.ipynb b/Models/AVC/data_analysis.ipynb similarity index 100% rename from AVCGenerator/data_analysis.ipynb rename to Models/AVC/data_analysis.ipynb diff --git a/AVCGenerator/data_preparation.ipynb b/Models/AVC/data_preparation.ipynb similarity index 100% rename from AVCGenerator/data_preparation.ipynb rename to Models/AVC/data_preparation.ipynb diff --git a/AVCGenerator/data_preparation_simple.ipynb b/Models/AVC/data_preparation_simple.ipynb similarity index 100% rename from AVCGenerator/data_preparation_simple.ipynb rename to Models/AVC/data_preparation_simple.ipynb diff --git a/AVCGenerator/generate_avcdataset.py b/Models/AVC/generate_avcdataset.py similarity index 100% rename from AVCGenerator/generate_avcdataset.py rename to Models/AVC/generate_avcdataset.py diff --git a/AVCGenerator/generate_eins.py b/Models/AVC/generate_eins.py similarity index 100% rename from AVCGenerator/generate_eins.py rename to Models/AVC/generate_eins.py diff --git a/AVCGenerator/mortality_weibull_params.pkl b/Models/AVC/mortality_weibull_params.pkl similarity index 100% rename from AVCGenerator/mortality_weibull_params.pkl rename to Models/AVC/mortality_weibull_params.pkl diff --git a/AVCGenerator/pathway_avc.py b/Models/AVC/pathway_avc.py similarity index 100% rename from AVCGenerator/pathway_avc.py rename to Models/AVC/pathway_avc.py diff --git a/AVCGenerator/ptavc.csv b/Models/AVC/ptavc.csv similarity index 100% rename from AVCGenerator/ptavc.csv rename to Models/AVC/ptavc.csv diff --git a/AVCGenerator/ptavc.pkl b/Models/AVC/ptavc.pkl similarity index 100% rename from AVCGenerator/ptavc.pkl rename to Models/AVC/ptavc.pkl diff --git a/AVCGenerator/test.py b/Models/AVC/test.py similarity index 100% rename from AVCGenerator/test.py rename to Models/AVC/test.py diff --git a/SNDSGenerator/AnalysePostGeneration.ipynb b/Models/GeneralPop/AnalysePostGeneration.ipynb similarity index 100% rename from SNDSGenerator/AnalysePostGeneration.ipynb rename to Models/GeneralPop/AnalysePostGeneration.ipynb diff --git a/SNDSGenerator/PostGenerationAnalysis.ipynb b/Models/GeneralPop/PostGenerationAnalysis.ipynb similarity index 100% rename from SNDSGenerator/PostGenerationAnalysis.ipynb rename to Models/GeneralPop/PostGenerationAnalysis.ipynb diff --git a/SNDSGenerator/PostGenerationAnalysis_Bretagne.ipynb b/Models/GeneralPop/PostGenerationAnalysis_Bretagne.ipynb similarity index 100% rename from SNDSGenerator/PostGenerationAnalysis_Bretagne.ipynb rename to Models/GeneralPop/PostGenerationAnalysis_Bretagne.ipynb diff --git a/SNDSGenerator/PostGenerationAnalysis_Brittany.ipynb b/Models/GeneralPop/PostGenerationAnalysis_Brittany.ipynb similarity index 100% rename from SNDSGenerator/PostGenerationAnalysis_Brittany.ipynb rename to Models/GeneralPop/PostGenerationAnalysis_Brittany.ipynb diff --git a/SNDSGenerator/PostGenerationAnalysis_Paris.ipynb b/Models/GeneralPop/PostGenerationAnalysis_Paris.ipynb similarity index 100% rename from SNDSGenerator/PostGenerationAnalysis_Paris.ipynb rename to Models/GeneralPop/PostGenerationAnalysis_Paris.ipynb diff --git a/SNDSGenerator/generation_3522.py b/Models/GeneralPop/generation_3522.py similarity index 100% rename from SNDSGenerator/generation_3522.py rename to Models/GeneralPop/generation_3522.py diff --git a/SNDSGenerator/generation_brittany.py b/Models/GeneralPop/generation_brittany.py similarity index 100% rename from SNDSGenerator/generation_brittany.py rename to Models/GeneralPop/generation_brittany.py diff --git a/SNDSGenerator/generation_paris.py b/Models/GeneralPop/generation_paris.py similarity index 100% rename from SNDSGenerator/generation_paris.py rename to Models/GeneralPop/generation_paris.py diff --git a/VICAN/Simu_VICAN.py b/Models/VICAN/Simu_VICAN.py similarity index 100% rename from VICAN/Simu_VICAN.py rename to Models/VICAN/Simu_VICAN.py diff --git a/VICAN/Simu_age.py b/Models/VICAN/Simu_age.py similarity index 100% rename from VICAN/Simu_age.py rename to Models/VICAN/Simu_age.py diff --git a/VICAN/pathways.py b/Models/VICAN/pathways.py similarity index 100% rename from VICAN/pathways.py rename to Models/VICAN/pathways.py diff --git a/Models/VICAN/test_VICAN.py b/Models/VICAN/test_VICAN.py new file mode 100644 index 0000000000000000000000000000000000000000..07ab1d450938400f7d13e47635560fdaef2311d2 --- /dev/null +++ b/Models/VICAN/test_VICAN.py @@ -0,0 +1,86 @@ + +##### Simulation âge + +import logging + +import numpy as np +import pandas as pd +from scipy.stats import truncnorm + +from pathways import P1, P2, P3, P4, P5, P6, P7, P8, P9, P10 +from SNDSGenerator.simuExternal import simuExternal +from SNDSGenerator.simulationDB import simDB + + +logging.basicConfig(filename="debug.log", encoding="utf-8", level=logging.DEBUG) + +total_patients = 100 + +# Définition des proportions pour chaque tranche d'âge +proportions = [0.2143, 0.2571, 0.2857, 0.2429] + +# Ajuster les paramètres pour les lois normales tronquées +# (moyenne, écart-type, borne inférieure, borne supérieure) +params = [ + (48, 7, 20, 49), # <50 ans + (54, 3.5, 50, 59), # 50-59 ans + (65, 4, 60, 69), # 60-69 ans + (75, 6, 70, 95), # ≥70 ans +] + +# Calcul des effectifs en arrondissant à l'entier le plus proche +n_points_per_group = [int(round(total_patients * prop)) for prop in proportions] + +# Ajuster le dernier groupe pour compenser l'écart +n_points_per_group[-1] += total_patients - sum(n_points_per_group) + + +# Fonction pour générer les données en utilisant une loi normale tronquée +def generate_truncated_normal(mean, std, lower, upper, n): + a, b = (lower - mean) / std, (upper - mean) / std + return truncnorm.rvs(a, b, loc=mean, scale=std, size=n) + + +# Génération des données +samples1 = [] +age_labels = [] +for i, (n_points, (mean, std, lower, upper)) in enumerate( + zip(n_points_per_group, params) +): + samples1.extend(generate_truncated_normal(mean, std, lower, upper, n_points)) + age_labels.extend([i] * n_points) + +# Convertir en numpy array pour manipulation +samples1 = np.array(np.round(samples1)) + +# creation of the population table representing the population : +# - all patients have an Undefined status in this profile +population = pd.DataFrame({"sex": 1, "age": samples1, "status": "U"}) + +population["pathway"] = "P6" + +simu = { + "patients": { + "data": population, + "attributes": { + "sex": "sex", + "age": "age", + }, + } +} + +simulator = simuExternal(nomencl="datarep/snds_nomenclature.db", datarep="datarep") +simulator.run() +simulator.load(simu) + +simulator.context.year = 2022 + +# injection of the pathways +for P in [P1, P2, P3, P4, P5, P6, P7, P8, P9, P10]: + p = P(simulator) + for patient in simulator.patients: + p.injection(patient) + +dbgen = simDB() +dbgen.output_db_name = "Marie/snds_vican.db" +dbgen.generate(simulator, rootschemas="schema-snds/schemas") diff --git a/SNDSGenerator/test_simubrute.py b/Models/simubrute.py similarity index 100% rename from SNDSGenerator/test_simubrute.py rename to Models/simubrute.py diff --git a/README.md b/README.md index 74bb8746df762f0648833fccdbe322bd1eb48123..bbe2a23ed421a5e1031471445d89659a341ee74d 100644 --- a/README.md +++ b/README.md @@ -124,11 +124,11 @@ git clone https://gitlab.com/healthdatahub/schema-snds.git 2. Create the core database (with nomenclatures) * Run the script `prepare.py` that cleans the SNDS schemas (fix some typos, etc.) ```bash - python SNDSGenerator/prepare.py + python Data_Preparation/prepare.py ``` * Run the script `create_nomenclature.py` to create the nomenclature part of the SNDS. This script create a large database made of the SNDS tables that can be provided without access restriction. This part of the database is provided by the Health Data Hub. ```bash - python SNDSGenerator/create_nomenclatures.py + python Data_Preparation/create_nomenclatures.py mkdir datarep mv snds_nomenclature.db datarep/ ``` @@ -147,11 +147,11 @@ unzip populational_distributions.zip -d datarep/ * Each step is detailed in corresponding Notebooks in the `Data_Analysis` repository (which explains the details of the computation and draw Figures to assess the approximations) 4. Run the `generate_avcdataset.py` script to run the simulator based on Open Data. This script can be modified to setup some parameters if the simulation, for instance: the population size, the number of physicians, the administrative regions to mimic, etc. ```bash -python AVCGenerator/generate_avcdataset.py +python Models/AVC/generate_avcdataset.py ``` 5. Then ... you have a database in the `AVCGenerator` directory that can be browsed. For instance: ```bash -sqlitebrowser AVCGenerator/snds_testgen.db & +sqlitebrowser Models/AVC/snds_testgen.db & ``` ## Open data resources diff --git a/SNDSGenerator/README.md b/SNDSGenerator/README.md deleted file mode 100644 index 04f25c36d17fe61d94a94161ee496d2562b67578..0000000000000000000000000000000000000000 --- a/SNDSGenerator/README.md +++ /dev/null @@ -1,50 +0,0 @@ - -_author:_ Thomas Guyet (Inria) -_date:_ 12/12/2022 - - -## Project objective and steps - -The main objective of the project is to simulate a synthetic dataset of care pathways that may have some (statistical) similarities with a real dataset and that has some privacy guarantees. -More especially, we focus on the SNDS database schema. - -In order to acheive this goal, we propose to organize the project in 3 main steps - -* generating purely random data in the schema of the SNDS database (**on-going**) -* generating random data that reproduce some statistical distributions - 1. reproduce simple global statistical features such as the distribution of medics in the whole population, distribution of genders, distribution of physicians, geographic distributions, etc. Here, the specificy is to reproduce aggregated statistics that are available in open data. The challenge here is to have statistical processes to would approxiate the joint distributions! - 2. reproduce some characteristics of care pathways. At this stage, the objective is to reproduce some count statistics at the individual level. This objective requierts to have an access to real care pathways from which extract the statistical characteristics to reproduce. - 3. generate realistic care pathways (and collection of patients with different types of pathways), including the sequentiality and the delays between cares. Statistical random generation processes may be blended with rule based generation processes to reproduce care trajectories. -* generating data with privacy guarantees. The steps 2. and 3. of the previous main step may be - -## Simulation principle - -To generate random data in the schema of the SNDS, there are different aspects to consider: -* the *medical environment*: it represents the types of cares or drugs that can be delivered, the geography, the temporal bourdaries of the simulation. All this information can be considered as static and available in the nomenclaure -* the *care offer*: it represents the set of care providers (physicians, pharmacists, GPs, specialists, hospital...). The list of hospital in known, but the list of care provider has to be randomly generated -* the *population*: it represents the set of patients to which cares have been delivered (or not). -* the *care pathways*: it represent the sequence of cares delivered for a patient - -For each of these aspects, there are specific data to reuse or to generate. - -From the computer enginering point of view, the main idea of the tool is to have two different layers: -* a theoretical model of a popution/environment/cares: it is an internal model that we start to organize be listing the large four categories of entities -* a concrete data generation model that is able to concretise an instance of the model in a data format. The first target is a SNDS database (we choose a SQLite database and the database schema provided by the HDH). The second target is a RDF schema. - - -The theoretical model may have different shapes: statistical model, object oriented model, multi-agent model, ... - -As a computer scientist (and pragmatically) I lean toward an object oriented model. - -## A object oriented theoretical model - -The model is given in the `database_model.py` file which describes a different classes that represents the entities of our model. -The `data_factory.py` file holds classes that are factories of the classes in the model. This factories may be implemented with the different strastegies we introduced above (from random to realistic). - -The model we propose is made of the following main classes: -* `PS` a care provider that has sub-classes - * `Provider` that, at the time, represents the pharmacists - * `Physician` that can be a `GP` or a `Specialist` -* `Patient` -* `CareDelivery` that corresponds to the different types of care a patient may receive (drug deliveries, medical acts, GP visits, hospital stays, etc.). The categories of cares are guided by the SNDS stucture that model outpatient trajectories. - * `DrugDelivery` diff --git a/SNDSGenerator/test.py b/SNDSGenerator/test.py deleted file mode 100644 index 9994cb565d4ddc9b0635c03d024e96d6ed1a0837..0000000000000000000000000000000000000000 --- a/SNDSGenerator/test.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on Sun May 3 11:07:38 2020 - -@author: tguyet -""" - -from data_factory import FactoryContext, PatientFactory, PhysicianFactory, PharmacyFactory, DrugsDeliveryFactory -import os - -os.chdir("/home/tguyet/Progs/medtrajectory_datagen/Generator") - - -context=FactoryContext() - -pfactory=PharmacyFactory(context) -pharms = pfactory.generate(10) -for p in pharms: - print(p) - -pfactory=PhysicianFactory(context) -GPs = pfactory.generateGP(5) -specialists = pfactory.generateSpecialists(10) -for p in GPs+specialists: - print(p) - -#pfactory=PatientFactory(context, [p.id for p in GPs]) -pfactory=PatientFactory(context, GPs) -patients = pfactory.generate(10) -for p in patients: - print(p) - -drugfact=DrugsDeliveryFactory(context,pharms) -for p in patients: - drugfact.generate(p,50) - for dd in p.drugdeliveries: - print(dd) \ No newline at end of file diff --git a/SNDSGenerator/test_McKenna.py b/SNDSGenerator/test_McKenna.py deleted file mode 100644 index 554a079be0180aa4ea13e2f12a765bfa0c730f12..0000000000000000000000000000000000000000 --- a/SNDSGenerator/test_McKenna.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -In addition to the requirements, this script requires to install Private-PGM. -Under Ubuntu system, you can simply run this command to download and install it - -pip3 install git+https://github.com/ryan112358/private-pgm.git - -""" - -from simuExternal import simuExternal -from simulationDB import simDB -import pandas as pd -import numpy as np -from mbi import Dataset, Domain, FactoredInference - - -print('############## Domain ################') -# Define the characteristics of the dataset to generate on the basis of a generation of 1000 patients - -nb_patients_togenerate=1000 -nb_drugs_deliveries=12*nb_patients_togenerate -#12: mean number of deliveries (example par patients) - -domain = Domain(['id','age','sex','RR', 'ATC'], [nb_patients_togenerate,20,2,18,1117]) -#id: id of the patient -#age/sex/RR: characteristics of the - -print('############## Data Preparation #############') - -#Marginals are evaluated on the basis of 10000 patients - -# Define a marginal based on the triple 'age', 'sex' and 'RR' (population) -pop = pd.read_csv("../datarep/pop.csv").drop(['Unnamed: 0'],axis=1) -pop = pop.dropna() -pop['pop'] /= np.sum(pop['pop'])/10000 -marginal_ASL = pop.groupby(['age','sex','RR']).aggregate({'pop':'sum'}).reset_index() - -#save the modalities -ages= marginal_ASL['age'].drop_duplicates().to_list() -sexes= marginal_ASL['sex'].drop_duplicates().to_list() -RRs = marginal_ASL['RR'].drop_duplicates().to_list() - -marginal_ASL = marginal_ASL['pop'].to_numpy() - -# a marginal for the number of deliveries (normal distribution) -# : 12: mean number of deliveries -marginal_id = np.maximum(0,np.random.normal(12,1,nb_patients_togenerate)) - - -drugs_freq=pd.read_csv("../datarep/drugs_freq.csv").drop(['Unnamed: 0'],axis=1) -ATCs = drugs_freq['ATC5'].drop_duplicates().to_list() -cips=drugs_freq[['ATC5','CIP13']].set_index('ATC5') - -print('################### Modelisation ###############') -epsilon = 1.0 -sigma = 1.0 / epsilon -Iasrr = np.eye( 20 * 2 * 18 ) -Ip = np.eye( nb_patients_togenerate ) -measurements = [ (Iasrr, marginal_ASL, sigma, ('age','sex','RR')), - (Ip, marginal_id, sigma, ('id'))] - -engine = FactoredInference(domain, log=True) -model = engine.estimate(measurements, engine='MD') - - -print('################## Generation of a new dataset ##############') - -synthdata = model.synthetic_data(rows=10000) -print(synthdata.df) - -#check the numbers of deliveries -synthdata.df.groupby('id').size().drop_duplicates() - - -print('################# Transformation of the synthetic database ###############') -# -> reprendre des - -def generate_line(x): - age=ages[x['age']] - sex=sexes[x['sex']] - RR=RRs[x['RR']] - #chose one of the CIP in the ATC class: - try: - CIP=np.random.choice(cips.loc[ ATCs[x['ATC']] ]['CIP13'].to_list()) - except AttributeError: - CIP=cips.loc[ ATCs[x['ATC']] ]['CIP13'] - - - return {'patient':x['id'], 'age':age, 'sex':sex, 'RR':RR, 'CIP13':CIP} - -#apply the transformation (a bit long) -ret = synthdata.df.apply(generate_line, axis=1) - -drugs=pd.DataFrame(ret.to_list()) - -print('################## Save the database into the SNDS ################') -population = drugs[['patient','age','sex']].drop_duplicates().set_index('patient') - -# representation of a simulation with datasets -simu = { - "patients": { - "data":population, - "attributes":{'sex':'sex', 'age':'age'} - }, - "drugs": { - "data":drugs, - "attributes":{'pid':'patient','cip':'CIP13'} - } -} - -sim = simuExternal(nomencl="../datarep/snds_nomenclature.db", - datarep="../datarep") -sim.run() - -dbgen = simDB() -sim.load(simu) - -for p in sim.patients: - for dd in p.drugdeliveries: - print(dd) - for dd in p.visits: - print(dd) - for dd in p.medicalacts: - print(dd) - for dd in p.hospitalStays: - print(dd) - print("========") - -dbgen.generate(sim, rootschemas="../schema-snds/schemas") diff --git a/cims/cim_codes.csv b/data/cims/cim_codes.csv similarity index 100% rename from cims/cim_codes.csv rename to data/cims/cim_codes.csv diff --git a/cims/load_cim.py b/data/cims/load_cim.py similarity index 100% rename from cims/load_cim.py rename to data/cims/load_cim.py