|
3 | 3 | Datasets |
4 | 4 | ======== |
5 | 5 |
|
6 | | -A basic tutorial on how to list and download datasets. |
| 6 | +A basic tutorial on how to list, load and visualize datasets. |
7 | 7 | """ |
8 | 8 | ############################################################################ |
9 | | -import openml |
| 9 | +# In general, we recommend working with tasks, so that the results can |
| 10 | +# be easily reproduced. Furthermore, the results can be compared to existing results |
| 11 | +# at OpenML. However, for the purposes of this tutorial, we are going to work with |
| 12 | +# the datasets directly. |
10 | 13 |
|
| 14 | +import openml |
11 | 15 | ############################################################################ |
12 | 16 | # List datasets |
13 | 17 | # ============= |
|
19 | 23 | # Download a dataset |
20 | 24 | # ================== |
21 | 25 |
|
22 | | -first_dataset_id = int(datasets_df['did'].iloc[0]) |
23 | | -dataset = openml.datasets.get_dataset(first_dataset_id) |
| 26 | +# Iris dataset https://www.openml.org/d/61 |
| 27 | +dataset = openml.datasets.get_dataset(61) |
24 | 28 |
|
25 | 29 | # Print a summary |
26 | 30 | print(f"This is dataset '{dataset.name}', the target feature is " |
27 | 31 | f"'{dataset.default_target_attribute}'") |
28 | 32 | print(f"URL: {dataset.url}") |
29 | 33 | print(dataset.description[:500]) |
| 34 | + |
| 35 | +############################################################################ |
| 36 | +# Load a dataset |
| 37 | +# ============== |
| 38 | + |
| 39 | +# X - an array/dataframe in which each row represents one example and
| 40 | +# holds the corresponding feature values
| 41 | +# y - the classes for each example |
| 42 | +# categorical_indicator - a list indicating, for each feature, whether it is categorical
| 43 | +# attribute_names - the names of the features for the examples (X);
| 44 | +# the target feature is returned separately as y
| 45 | +X, y, categorical_indicator, attribute_names = dataset.get_data( |
| 46 | + dataset_format='dataframe', |
| 47 | + target=dataset.default_target_attribute |
| 48 | +) |
| 49 | +############################################################################ |
| 50 | +# Visualize the dataset |
| 51 | +# ===================== |
| 52 | + |
| 53 | +import pandas as pd |
| 54 | +import seaborn as sns |
| 55 | +import matplotlib.pyplot as plt |
| 56 | +sns.set_style("darkgrid") |
| 57 | + |
| 58 | + |
| 59 | +def hide_current_axis(): |
| 60 | + plt.gca().set_visible(False) |
| 61 | + |
| 62 | + |
| 63 | +# We combine all the data so that we can map the different |
| 64 | +# examples to different colors according to the classes. |
| 65 | +combined_data = pd.concat([X, y], axis=1) |
| 66 | +iris_plot = sns.pairplot(combined_data, hue="class") |
| 67 | +iris_plot.map_upper(hide_current_axis) |
| 68 | +plt.show() |
0 commit comments