Quickstart¶
Basic¶
The best way to learn R-style formula syntax with ydot is to head over to patsy [pat] and read the documentation. Below, we show very simple code to transform a Spark dataframe into two design matrices (these are also Spark dataframes), y and X, using a formula that defines a model up to two-way interactions.
"""Quickstart: build design matrices (y, X) from a Spark dataframe with ydot."""
import random
from random import choice

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

from ydot.spark import smatrices

# Fix both RNGs so the generated data is reproducible.
random.seed(37)
np.random.seed(37)


def get_spark_dataframe(spark):
    """Return a toy Spark dataframe: two categorical (a, b), two continuous
    (x1, x2) columns and a binary response y, with 100 rows."""
    n = 100
    data = {
        'a': [choice(['left', 'right']) for _ in range(n)],
        'b': [choice(['high', 'mid', 'low']) for _ in range(n)],
        'x1': np.random.normal(20, 1, n),
        'x2': np.random.normal(3, 1, n),
        'y': [choice([1.0, 0.0]) for _ in range(n)]
    }
    pdf = pd.DataFrame(data)
    sdf = spark.createDataFrame(pdf)
    return sdf


if __name__ == '__main__':
    try:
        spark = (SparkSession.builder
                 .master('local[4]')
                 .appName('local-testing-pyspark')
                 .getOrCreate())
        sdf = get_spark_dataframe(spark)
        # A formula with all main effects plus every two-way interaction.
        y, X = smatrices('y ~ (x1 + x2 + a + b)**2', sdf)
        y = y.toPandas()
        X = X.toPandas()
        print(X.head(10))
        X.head(10).to_csv('two-way-interactions.csv', index=False)
    except Exception as e:
        print(e)
    finally:
        # spark may be unbound if session creation itself failed;
        # the inner try keeps teardown best-effort.
        try:
            spark.stop()
            print('closed spark')
        except Exception as e:
            print(e)
More¶
We use the code below to generate the design-matrix data shown in the tables that follow.
"""Generate design-matrix CSVs and matching RST ``csv-table`` snippets for
several patsy formulas applied to a toy Spark dataframe."""
import random
from random import choice

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

from ydot.spark import smatrices

# Fix both RNGs so the generated data is reproducible.
random.seed(37)
np.random.seed(37)


def get_spark_dataframe(spark):
    """Return a toy Spark dataframe: two categorical (a, b), two continuous
    (x1, x2) columns and a binary response y, with 100 rows."""
    n = 100
    data = {
        'a': [choice(['left', 'right']) for _ in range(n)],
        'b': [choice(['high', 'mid', 'low']) for _ in range(n)],
        'x1': np.random.normal(20, 1, n),
        'x2': np.random.normal(3, 1, n),
        'y': [choice([1.0, 0.0]) for _ in range(n)]
    }
    pdf = pd.DataFrame(data)
    sdf = spark.createDataFrame(pdf)
    return sdf


if __name__ == '__main__':
    try:
        spark = (SparkSession.builder
                 .master('local[4]')
                 .appName('local-testing-pyspark')
                 .getOrCreate())
        sdf = get_spark_dataframe(spark)
        # Each entry pairs a patsy formula ('f') with its output CSV ('o').
        formulas = [
            {
                'f': 'y ~ np.sin(x1) + np.cos(x2) + a + b',
                'o': 'transformed-continuous.csv'
            },
            {
                'f': 'y ~ x1*x2',
                'o': 'star-con-interaction.csv'
            },
            {
                'f': 'y ~ a*b',
                'o': 'star-cat-interaction.csv'
            },
            {
                'f': 'y ~ x1:x2',
                'o': 'colon-con-interaction.csv'
            },
            {
                'f': 'y ~ a:b',
                'o': 'colon-cat-interaction.csv'
            },
            {
                'f': 'y ~ (x1 + x2) / (a + b)',
                'o': 'divide-interaction.csv'
            },
            {
                'f': 'y ~ x1 + x2 + a - 1',
                'o': 'no-intercept.csv'
            }
        ]
        for item in formulas:
            f = item['f']
            o = item['o']
            y, X = smatrices(f, sdf)
            y = y.toPandas()
            X = X.toPandas()
            X.head(5).to_csv(o, index=False)
            # Emit the matching RST csv-table directive for the docs.
            s = f"""
.. csv-table:: {f}
   :file: _code/{o}
   :header-rows: 1
"""
            print(s.strip())
    except Exception as e:
        print(e)
    finally:
        # spark may be unbound if session creation itself failed;
        # the inner try keeps teardown best-effort.
        try:
            spark.stop()
            print('closed spark')
        except Exception as e:
            print(e)
You can use numpy functions against continuous variables.
Intercept |
a[T.right] |
b[T.low] |
b[T.mid] |
np.sin(x1) |
np.cos(x2) |
---|---|---|---|---|---|
1.0 |
0.0 |
1.0 |
0.0 |
0.8893769205406579 |
-0.758004200582313 |
1.0 |
0.0 |
1.0 |
0.0 |
0.9679261582216445 |
-0.5759807266894401 |
1.0 |
1.0 |
0.0 |
0.0 |
0.9972849995254774 |
-0.9086185088676886 |
1.0 |
1.0 |
0.0 |
1.0 |
-0.14934132364604816 |
0.4783416124776783 |
1.0 |
0.0 |
1.0 |
0.0 |
0.45523550315103734 |
-0.7588816501987654 |
The *
specifies interactions and keeps lower order terms.
Intercept |
x1 |
x2 |
x1:x2 |
---|---|---|---|
1.0 |
19.945536387662504 |
3.85214120038979 |
76.83302248278848 |
1.0 |
20.674308066353493 |
4.098585619118175 |
84.73542172597531 |
1.0 |
20.346647025958433 |
2.7107604387194626 |
55.154885818557126 |
1.0 |
18.699653829045985 |
5.2111542692543065 |
97.44678088481062 |
1.0 |
21.51851187887476 |
2.432390426907621 |
52.341422295472896 |
Intercept |
a[T.right] |
b[T.low] |
b[T.mid] |
a[T.right]:b[T.low] |
a[T.right]:b[T.mid] |
---|---|---|---|---|---|
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
0.0 |
1.0 |
0.0 |
1.0 |
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
The :
specifies interactions and drops lower order terms.
Intercept |
x1:x2 |
---|---|
1.0 |
76.83302248278848 |
1.0 |
84.73542172597531 |
1.0 |
55.154885818557126 |
1.0 |
97.44678088481062 |
1.0 |
52.341422295472896 |
Intercept |
b[T.low] |
b[T.mid] |
a[T.right]:b[high] |
a[T.right]:b[low] |
a[T.right]:b[mid] |
---|---|---|---|---|---|
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
1.0 |
0.0 |
1.0 |
0.0 |
0.0 |
1.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
The /
is quirky according to the patsy documentation, but it is shorthand for a / b = a + a:b
.
Intercept |
x1 |
x2 |
x1:x2:a[left] |
x1:x2:a[right] |
x1:x2:b[T.low] |
x1:x2:b[T.mid] |
---|---|---|---|---|---|---|
1.0 |
19.945536387662504 |
3.85214120038979 |
76.83302248278848 |
0.0 |
76.83302248278848 |
0.0 |
1.0 |
20.674308066353493 |
4.098585619118175 |
84.73542172597531 |
0.0 |
84.73542172597531 |
0.0 |
1.0 |
20.346647025958433 |
2.7107604387194626 |
0.0 |
55.154885818557126 |
0.0 |
0.0 |
1.0 |
18.699653829045985 |
5.2111542692543065 |
0.0 |
97.44678088481062 |
0.0 |
97.44678088481062 |
1.0 |
21.51851187887476 |
2.432390426907621 |
52.341422295472896 |
0.0 |
52.341422295472896 |
0.0 |
If you need to drop the Intercept
, add - 1
at the end. Note that one of the dummy variables for a
is not dropped. This could be a bug with patsy.
a[left] |
a[right] |
x1 |
x2 |
---|---|---|---|
1.0 |
0.0 |
19.945536387662504 |
3.85214120038979 |
1.0 |
0.0 |
20.674308066353493 |
4.098585619118175 |
0.0 |
1.0 |
20.346647025958433 |
2.7107604387194626 |
0.0 |
1.0 |
18.699653829045985 |
5.2111542692543065 |
1.0 |
0.0 |
21.51851187887476 |
2.432390426907621 |