Quickstart

Basic

The best way to learn R-style formula syntax with ydot is to head on over to patsy [pat] and read the documentation. Below, we show very simple code to transform a Spark dataframe into two design matrices (these are also Spark dataframes), y and X, using a formula that defines a model up to two-way interactions.

import random
from random import choice

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

from ydot.spark import smatrices

random.seed(37)
np.random.seed(37)


def get_spark_dataframe(spark):
    n = 100
    data = {
        'a': [choice(['left', 'right']) for _ in range(n)],
        'b': [choice(['high', 'mid', 'low']) for _ in range(n)],
        'x1': np.random.normal(20, 1, n),
        'x2': np.random.normal(3, 1, n),
        'y': [choice([1.0, 0.0]) for _ in range(n)]
    }
    pdf = pd.DataFrame(data)

    sdf = spark.createDataFrame(pdf)
    return sdf


if __name__ == '__main__':
    try:
        spark = (SparkSession.builder
                 .master('local[4]')
                 .appName('local-testing-pyspark')
                 .getOrCreate())
        sdf = get_spark_dataframe(spark)

        # y and X are design matrices returned as Spark dataframes
        y, X = smatrices('y ~ (x1 + x2 + a + b)**2', sdf)
        y = y.toPandas()
        X = X.toPandas()

        print(X.head(10))
        X.head(10).to_csv('two-way-interactions.csv', index=False)
    except Exception as e:
        print(e)
    finally:
        try:
            spark.stop()
            print('closed spark')
        except Exception as e:
            print(e)
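
The **2 in the formula above crosses all of the terms inside the parentheses up to degree two, so X contains the intercept, every main effect, and every two-way interaction. If you only want to see the expanded terms, you can inspect the column names directly. A minimal sketch, assuming the sdf and spark session created in the script above:

# list the expanded model terms without pulling any data into pandas
y, X = smatrices('y ~ (x1 + x2 + a + b)**2', sdf)
print(X.columns)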

More

The code below generates the design matrices shown in the tables that follow.

import random
from random import choice

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

from ydot.spark import smatrices

random.seed(37)
np.random.seed(37)


def get_spark_dataframe(spark):
    n = 100
    data = {
        'a': [choice(['left', 'right']) for _ in range(n)],
        'b': [choice(['high', 'mid', 'low']) for _ in range(n)],
        'x1': np.random.normal(20, 1, n),
        'x2': np.random.normal(3, 1, n),
        'y': [choice([1.0, 0.0]) for _ in range(n)]
    }
    pdf = pd.DataFrame(data)

    sdf = spark.createDataFrame(pdf)
    return sdf


if __name__ == '__main__':
    try:
        spark = (SparkSession.builder
                 .master('local[4]')
                 .appName('local-testing-pyspark')
                 .getOrCreate())
        sdf = get_spark_dataframe(spark)

        # each entry pairs a patsy formula with the CSV file its design matrix preview is written to
        formulas = [
            {
                'f': 'y ~ np.sin(x1) + np.cos(x2) + a + b',
                'o': 'transformed-continuous.csv'
            },
            {
                'f': 'y ~ x1*x2',
                'o': 'star-con-interaction.csv'
            },
            {
                'f': 'y ~ a*b',
                'o': 'star-cat-interaction.csv'
            },
            {
                'f': 'y ~ x1:x2',
                'o': 'colon-con-interaction.csv'
            },
            {
                'f': 'y ~ a:b',
                'o': 'colon-cat-interaction.csv'
            },
            {
                'f': 'y ~ (x1 + x2) / (a + b)',
                'o': 'divide-interaction.csv'
            },
            {
                'f': 'y ~ x1 + x2 + a - 1',
                'o': 'no-intercept.csv'
            }
        ]

        for item in formulas:
            f = item['f']
            o = item['o']

            y, X = smatrices(f, sdf)
            y = y.toPandas()
            X = X.toPandas()

            X.head(5).to_csv(o, index=False)

            s = f"""
            .. csv-table:: {f}
               :file: _code/{o}
               :header-rows: 1
            """
            print(s.strip())
    except Exception as e:
        print(e)
    finally:
        try:
            spark.stop()
            print('closed spark')
        except Exception as e:
            print(e)

You can apply numpy functions to continuous variables in a formula.

y ~ np.sin(x1) + np.cos(x2) + a + b

Intercept | a[T.right] | b[T.low] | b[T.mid] | np.sin(x1) | np.cos(x2)
1.0 | 0.0 | 1.0 | 0.0 | 0.8893769205406579 | -0.758004200582313
1.0 | 0.0 | 1.0 | 0.0 | 0.9679261582216445 | -0.5759807266894401
1.0 | 1.0 | 0.0 | 0.0 | 0.9972849995254774 | -0.9086185088676886
1.0 | 1.0 | 0.0 | 1.0 | -0.14934132364604816 | 0.4783416124776783
1.0 | 0.0 | 1.0 | 0.0 | 0.45523550315103734 | -0.7588816501987654
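
Other numpy functions should work the same way, as long as they can be reached through the np namespace used in the formula. A minimal sketch, assuming the sdf created in the scripts above (np.log and np.exp are illustrative choices, not functions special-cased by ydot):

y, X = smatrices('y ~ np.log(x1) + np.exp(x2) + a + b', sdf)
print(X.toPandas().head())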

The * operator specifies an interaction and keeps the lower-order terms.

y ~ x1*x2

Intercept | x1 | x2 | x1:x2
1.0 | 19.945536387662504 | 3.85214120038979 | 76.83302248278848
1.0 | 20.674308066353493 | 4.098585619118175 | 84.73542172597531
1.0 | 20.346647025958433 | 2.7107604387194626 | 55.154885818557126
1.0 | 18.699653829045985 | 5.2111542692543065 | 97.44678088481062
1.0 | 21.51851187887476 | 2.432390426907621 | 52.341422295472896

y ~ a*b

Intercept | a[T.right] | b[T.low] | b[T.mid] | a[T.right]:b[T.low] | a[T.right]:b[T.mid]
1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0
1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0
1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0
1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0
1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0
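
As a quick check of the expansion, y ~ x1*x2 should yield the same columns as spelling the lower-order terms out explicitly. A minimal sketch, assuming the sdf created above:

# x1*x2 is shorthand for x1 + x2 + x1:x2
_, X_star = smatrices('y ~ x1*x2', sdf)
_, X_explicit = smatrices('y ~ x1 + x2 + x1:x2', sdf)
assert set(X_star.columns) == set(X_explicit.columns)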

The : operator specifies an interaction and drops the lower-order terms.

y ~ x1:x2

Intercept | x1:x2
1.0 | 76.83302248278848
1.0 | 84.73542172597531
1.0 | 55.154885818557126
1.0 | 97.44678088481062
1.0 | 52.341422295472896

y ~ a:b

Intercept | b[T.low] | b[T.mid] | a[T.right]:b[high] | a[T.right]:b[low] | a[T.right]:b[mid]
1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0
1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0
1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0
1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0
1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0
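
To see the difference between : and *, compare the columns each one produces. A minimal sketch, assuming the sdf created above:

_, X_colon = smatrices('y ~ x1:x2', sdf)
_, X_star = smatrices('y ~ x1*x2', sdf)
print(X_colon.columns)  # only the Intercept and the x1:x2 interaction
print(X_star.columns)   # Intercept, x1, x2 and the x1:x2 interaction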

The / operator is quirky according to the patsy documentation, but it is simply shorthand: a / b expands to a + a:b.

y ~ (x1 + x2) / (a + b)

Intercept | x1 | x2 | x1:x2:a[left] | x1:x2:a[right] | x1:x2:b[T.low] | x1:x2:b[T.mid]
1.0 | 19.945536387662504 | 3.85214120038979 | 76.83302248278848 | 0.0 | 76.83302248278848 | 0.0
1.0 | 20.674308066353493 | 4.098585619118175 | 84.73542172597531 | 0.0 | 84.73542172597531 | 0.0
1.0 | 20.346647025958433 | 2.7107604387194626 | 0.0 | 55.154885818557126 | 0.0 | 0.0
1.0 | 18.699653829045985 | 5.2111542692543065 | 0.0 | 97.44678088481062 | 0.0 | 97.44678088481062
1.0 | 21.51851187887476 | 2.432390426907621 | 52.341422295472896 | 0.0 | 52.341422295472896 | 0.0
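
Since / is just shorthand, a nested formula can always be rewritten with + and : to get the same terms. A minimal sketch, assuming the sdf created above:

# x1 / a should expand to x1 + x1:a
_, X_nested = smatrices('y ~ x1 / a', sdf)
_, X_explicit = smatrices('y ~ x1 + x1:a', sdf)
print(X_nested.columns)
print(X_explicit.columns)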

If you need to drop the Intercept, add - 1 at the end of the formula. Note that both dummy variables for a are kept; with no intercept, patsy codes the first categorical factor with all of its levels to keep the design matrix identifiable, so this is expected behavior rather than a bug.

y ~ x1 + x2 + a - 1

a[left] | a[right] | x1 | x2
1.0 | 0.0 | 19.945536387662504 | 3.85214120038979
1.0 | 0.0 | 20.674308066353493 | 4.098585619118175
0.0 | 1.0 | 20.346647025958433 | 2.7107604387194626
0.0 | 1.0 | 18.699653829045985 | 5.2111542692543065
1.0 | 0.0 | 21.51851187887476 | 2.432390426907621
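
A minimal sketch of dropping the intercept, assuming the sdf created above; note that a keeps both of its levels once the intercept is gone:

_, X_no_intercept = smatrices('y ~ x1 + x2 + a - 1', sdf)
print(X_no_intercept.columns)  # a[left], a[right], x1, x2 -- no Intercept column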