|
1 | 1 | # %% [markdown] |
2 | | -# # Tasks: retrieving splits |
3 | | - |
4 | | -# Tasks define a target and a train/test split. Normally, they are the input to the function |
5 | | -# ``openml.runs.run_model_on_task`` which automatically runs the model on all splits of the task. |
6 | | -# However, sometimes it is necessary to manually split a dataset to perform experiments outside of |
7 | | -# the functions provided by OpenML. One such example is in the benchmark library |
8 | | -# [HPOBench](https://github.com/automl/HPOBench) which extensively uses data from OpenML, |
9 | | -# but not OpenML's functionality to conduct runs. |
10 | | - |
| 2 | +# Tasks define a target and a train/test split, which we can use for benchmarking. |
11 | 3 |
|
12 | 4 | # %% |
13 | 5 | import openml |
|
45 | 37 |
|
46 | 38 | # %% |
47 | 39 | print( |
48 | | - "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( |
49 | | - task_id, |
50 | | - n_repeats, |
51 | | - n_folds, |
52 | | - n_samples, |
53 | | - ) |
| 40 | + f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." |
54 | 41 | ) |
55 | 42 |
|
56 | 43 | # %% [markdown] |
|
72 | 59 | # And then split the data based on this: |
73 | 60 |
|
74 | 61 | # %% |
75 | | -X, y = task.get_X_and_y(dataset_format="dataframe") |
| 62 | +X, y = task.get_X_and_y() |
76 | 63 | X_train = X.iloc[train_indices] |
77 | 64 | y_train = y.iloc[train_indices] |
78 | 65 | X_test = X.iloc[test_indices] |
79 | 66 | y_test = y.iloc[test_indices] |
80 | 67 |
|
81 | 68 | print( |
82 | | - "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format( |
83 | | - X_train.shape, |
84 | | - y_train.shape, |
85 | | - X_test.shape, |
86 | | - y_test.shape, |
87 | | - ) |
| 69 | + f"X_train.shape: {X_train.shape}, y_train.shape: {y_train.shape}, X_test.shape: {X_test.shape}, y_test.shape: {y_test.shape}" |
88 | 70 | ) |
89 | 71 |
|
90 | 72 | # %% [markdown] |
|
96 | 78 | X, y = task.get_X_and_y() |
97 | 79 | n_repeats, n_folds, n_samples = task.get_split_dimensions() |
98 | 80 | print( |
99 | | - "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( |
100 | | - task_id, |
101 | | - n_repeats, |
102 | | - n_folds, |
103 | | - n_samples, |
104 | | - ) |
| 81 | + f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." |
105 | 82 | ) |
106 | 83 |
|
107 | 84 | # %% [markdown] |
|
122 | 99 | y_test = y.iloc[test_indices] |
123 | 100 |
|
124 | 101 | print( |
125 | | - "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, " |
126 | | - "y_train.shape {}, X_test.shape {}, y_test.shape {}".format( |
127 | | - repeat_idx, |
128 | | - fold_idx, |
129 | | - sample_idx, |
130 | | - X_train.shape, |
131 | | - y_train.shape, |
132 | | - X_test.shape, |
133 | | - y_test.shape, |
134 | | - ) |
| 102 | + f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " |
| 103 | + f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" |
135 | 104 | ) |
136 | 105 |
|
137 | 106 | # %% [markdown] |
|
143 | 112 | X, y = task.get_X_and_y() |
144 | 113 | n_repeats, n_folds, n_samples = task.get_split_dimensions() |
145 | 114 | print( |
146 | | - "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( |
147 | | - task_id, |
148 | | - n_repeats, |
149 | | - n_folds, |
150 | | - n_samples, |
151 | | - ) |
| 115 | + f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." |
152 | 116 | ) |
153 | 117 |
|
154 | 118 | # %% [markdown] |
|
169 | 133 | y_test = y.iloc[test_indices] |
170 | 134 |
|
171 | 135 | print( |
172 | | - "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, " |
173 | | - "y_train.shape {}, X_test.shape {}, y_test.shape {}".format( |
174 | | - repeat_idx, |
175 | | - fold_idx, |
176 | | - sample_idx, |
177 | | - X_train.shape, |
178 | | - y_train.shape, |
179 | | - X_test.shape, |
180 | | - y_test.shape, |
181 | | - ) |
| 136 | + f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " |
| 137 | + f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" |
182 | 138 | ) |
183 | 139 |
|
184 | 140 | # %% [markdown] |
|
190 | 146 | X, y = task.get_X_and_y() |
191 | 147 | n_repeats, n_folds, n_samples = task.get_split_dimensions() |
192 | 148 | print( |
193 | | - "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( |
194 | | - task_id, |
195 | | - n_repeats, |
196 | | - n_folds, |
197 | | - n_samples, |
198 | | - ) |
| 149 | + f"Task {task_id}: number of repeats: {n_repeats}, number of folds: {n_folds}, number of samples {n_samples}." |
199 | 150 | ) |
200 | 151 |
|
201 | 152 | # %% [markdown] |
|
216 | 167 | y_test = y.iloc[test_indices] |
217 | 168 |
|
218 | 169 | print( |
219 | | - "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, " |
220 | | - "y_train.shape {}, X_test.shape {}, y_test.shape {}".format( |
221 | | - repeat_idx, |
222 | | - fold_idx, |
223 | | - sample_idx, |
224 | | - X_train.shape, |
225 | | - y_train.shape, |
226 | | - X_test.shape, |
227 | | - y_test.shape, |
228 | | - ) |
| 170 | + f"Repeat #{repeat_idx}, fold #{fold_idx}, samples {sample_idx}: X_train.shape: {X_train.shape}, " |
| 171 | + f"y_train.shape {y_train.shape}, X_test.shape {X_test.shape}, y_test.shape {y_test.shape}" |
229 | 172 | ) |
230 | | -# License: BSD 3-Clause |
0 commit comments