Set Up Your First Dataset

Datasource Access

The Lightly Worker reads input data directly from your cloud storage and uploads the results back to your cloud storage. Therefore, you must define an input bucket and a Lightly bucket. Lightly currently supports AWS S3 (and S3-compatible providers such as OBS), Google Cloud Storage, and Azure as cloud providers.

Learn more about the input and Lightly bucket: Cloud Storage.

📘

Setting up your first dataset and running your first selection can also be done in one simple Python script, which you can download here.

Create a Dataset

A new dataset can be easily created from the Python client. You can see all your datasets in the Lightly Platform.

from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasetType

# Connect to the Lightly API with your personal token.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")

# Register a new dataset on the Lightly Platform.
# Use DatasetType.VIDEOS instead when your input data consists of videos.
client.create_dataset(
    dataset_name="dataset-name",
    dataset_type=DatasetType.IMAGES,
)

# The client now holds the ID of the freshly created dataset; keep it for
# the datasource configuration steps below.
my_dataset_id = client.dataset_id
print(my_dataset_id)

After creating the dataset, you can configure the datasource for it. To get your configuration attributes, you must follow the steps to create a datasource for AWS, GCP, Azure, or OBS.

from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasourcePurpose

# Attach the client to the dataset created above.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")
client.dataset_id = "MY_DATASET_ID"

# Register both datasources with delegated-access credentials:
# the read-only input bucket and the read/write Lightly bucket.
for bucket_path, bucket_purpose in [
    ("s3://bucket/input/", DatasourcePurpose.INPUT),
    ("s3://bucket/lightly/", DatasourcePurpose.LIGHTLY),
]:
    client.set_s3_delegated_access_config(
        resource_path=bucket_path,
        region="eu-central-1",
        role_arn="S3-ROLE-ARN",
        external_id="S3-EXTERNAL-ID",
        purpose=bucket_purpose,
    )
from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasourcePurpose

# Attach the client to the dataset created above.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")
client.dataset_id = "MY_DATASET_ID"

# Credentials shared by both datasource configurations.
s3_credentials = dict(
    region="eu-central-1",
    access_key="S3-ACCESS-KEY",
    secret_access_key="S3-SECRET-ACCESS-KEY",
)

# Input datasource: Lightly reads your raw data from here.
client.set_s3_config(
    resource_path="s3://bucket/input/",
    purpose=DatasourcePurpose.INPUT,
    **s3_credentials,
)
# Lightly datasource: results are written back here.
client.set_s3_config(
    resource_path="s3://bucket/lightly/",
    purpose=DatasourcePurpose.LIGHTLY,
    **s3_credentials,
)
import json
from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasourcePurpose

# Configure the client to use the dataset ID created above.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")
client.dataset_id = "MY_DATASET_ID"

# Load the service-account key files with a context manager so the file
# handles are closed deterministically (the original `open(...)` calls
# leaked them), then re-serialize the JSON as the string the API expects.
with open("credentials_read.json", encoding="utf-8") as f:
    read_credentials = json.dumps(json.load(f))
with open("credentials_write.json", encoding="utf-8") as f:
    write_credentials = json.dumps(json.load(f))

# Configure the Input datasource (read access is sufficient).
client.set_gcs_config(
    resource_path="gs://bucket/input/",
    project_id="PROJECT-ID",
    credentials=read_credentials,
    purpose=DatasourcePurpose.INPUT,
)
# Configure the Lightly datasource (needs write access for results).
client.set_gcs_config(
    resource_path="gs://bucket/lightly/",
    project_id="PROJECT-ID",
    credentials=write_credentials,
    purpose=DatasourcePurpose.LIGHTLY,
)
from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasourcePurpose

# Attach the client to the dataset created above.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")
client.dataset_id = "MY_DATASET_ID"

# Both datasources live in the same storage account and share the token.
account_name = "ACCOUNT-NAME"
sas_token = "SAS-TOKEN"

# Input datasource: Lightly reads your raw data from here.
client.set_azure_config(
    container_name="my-container/input/",
    account_name=account_name,
    sas_token=sas_token,
    purpose=DatasourcePurpose.INPUT,
)
# Lightly datasource: results are written back here.
client.set_azure_config(
    container_name="my-container/lightly/",
    account_name=account_name,
    sas_token=sas_token,
    purpose=DatasourcePurpose.LIGHTLY,
)
from lightly.api import ApiWorkflowClient
from lightly.openapi_generated.swagger_client import DatasourcePurpose

# Attach the client to the dataset created above.
client = ApiWorkflowClient(token="MY_LIGHTLY_TOKEN")
client.dataset_id = "MY_DATASET_ID"


def _set_obs_datasource(resource_path, purpose):
    """Register one OBS datasource on the current dataset."""
    client.set_obs_config(
        resource_path=resource_path,
        obs_endpoint="https://obs-endpoint-of-your-cloud-provider.com",
        obs_access_key_id="OBS-ACCESS-KEY",
        obs_secret_access_key="OBS-SECRET-ACCESS-KEY",
        purpose=purpose,
    )


# Input datasource: Lightly reads your raw data from here.
_set_obs_datasource("obs://bucket/input/", DatasourcePurpose.INPUT)
# Lightly datasource: results are written back here.
_set_obs_datasource("obs://bucket/lightly/", DatasourcePurpose.LIGHTLY)

🚧

The credentials passed above need to provide Lightly with list and read access to the input bucket and with list, read, and write access to the Lightly bucket. See Google Cloud Storage, AWS S3, and Azure for help with configuring the different roles.


What’s Next