Some issues Jean-François & I could help AP-HP with:
Jean-François on a big project: a graphical tool to visualize and edit the data schema, for collaboration.
Me on several small projects:
graph TD
P[Patient 1]
C[Condition 1]
D[Drug 1]
D2[Drug 2]
P -->|"with ▾"| C
C -->|"without ▾"| D
D -->|"with ▾"| D2
subgraph Patient
PC["birth < 2000-01-01
<br>sex = male
<br>[+ add filter…]"]
P --> PC
end
subgraph Condition
CC["diagnosis = COVID-19
<br>[+ add filter…]"]
C --> CC
end
subgraph Drug
DC["name = chloroquine
<br>[+ add filter…]"]
D --> DC
end
subgraph Drug
DC2["[+ add filter…]"]
D2 --> DC2
end
sequenceDiagram
participant U as UI
participant B as Backend
participant T as Solr Transiator
participant S as Spark
participant R as Solr
U->>+B: query
B->>B: determine `fl`
B->>+T: translate solr subqueries
T-->>-B: subqueries
B->>B: build DAG
B->>S: get results
S->>R: get individual dataframes
R-->>S: dataframes
S->>S: magic
S-->>B: result dataframe
B-->>-U: results
val patient = Patient(patient_df, "patient")
val condition = Condition(condition_df, "condition")
val drug1 = Drug(drug1_df, "drug1")
val drug2 = Drug(drug2_df, "drug2")
InnerJoin(
AntiJoin(
InnerJoin(
patient,
condition
),
drug1
),
drug2
).select(patient, drug2)
left
id
(configurable per class), or left_id
right
has left_id
, left_fk
or left
val patient = Patient(patient1_df, "patient")
val condition = Condition(condition1_df, "condition")
val drug1 = Drug(drug1_df, "drug1")
val drug2 = Drug(drug2_df, "drug2")
(((patient + condition) - drug1) + drug2).select(patient1, drug2)
// With added filters
(patient | patient("age") > 20)
+ (condition | condition("diagnosis") === "COVID-19")
- (drug | drug("name") === "chloroquine").alias("choloroquine")
+ drug
Available in perso/alavoillotte/spark-query-dsl
// Example: Load a Dataiku dataset into a Spark DataFrame
val message = dkuContext.getQuery(sqlContext, "message")
val topic = dkuContext.getQuery(sqlContext, "topic")
val person = dkuContext.getQuery(sqlContext, "person")
// inner join: authors & their topics
person + topic
// multiple joins: authors, their topics and their messages
person + topic + message
// anti join: people without any message
person - message
// left outer join: people, possibly with their messages
person % message
// left semi join: people that have messages (select only the person)
person ^ message
// full outer join:
person %% message
// right outer join:
person %> message
// cross join: each combination of 2 people (donut)
person * person
// selft-join is tricky, requires alias
message.alias("parent") + message.on("message_id" -> "parent_id")
// Full spark.sql.Column expressivity
person | ( person("name").when(lit("Alex"), "full quota").otherwise("ok")
&& person("born").isNotNull )
// Chainable filters
person | person("name").endsWith("e")
| person("person_id") % 2 === 1
// Filters on join, useful for similar fields
person + topics | person("created") > topic("created")
// topic with an author whose name finishes with an e
// but don't care about the author, just used for filtering
(topic + person | person("name").endsWith("e")).select(topic)
// select all with auto-prefix
(topic + person).select() // topic_title, person_name, ...
// select all with no prefix, skip duplicates
(topic + person).select(true) // title, name, ...
// person with count of distinct messages
(person % message) / person
// person with count of mandatory distinct messages & topics
(person + message + topic) / person
// union of untitled topic authors (left semi join)
// and people who have posted no messages
(person ^ (topic | topic("title").isNull))
& (person - message)
perso/alavoillotte/spark-query-dsl