I think there are actually two steps here:
1. read multiple files into a DataFrame
2. run the clustering on the DataFrame from step 1

Since @denyce already gave an example for local files, I can give an example of step 1 with AWS S3:
import io

import boto3
import pandas as pd

def f(bucket, key, region_name):
    s3_resource = boto3.resource('s3', region_name=region_name)
    s3_bucket = s3_resource.Bucket(bucket)
    df_list = []
    s3_objs = s3_bucket.objects.filter(Prefix=key)
    for s3_prefix_obj in s3_objs:
        # skip the zero-byte "folder" placeholder object, if there is one
        if s3_prefix_obj.key.endswith('/'):
            continue
        # read the raw bytes of one object under the prefix
        body = s3_prefix_obj.get()['Body'].read()
        # some medium work: here I assume the files are CSV; adjust the parser if not
        s3_prefix_df = pd.read_csv(io.BytesIO(body))
        df_list.append(s3_prefix_df)
    # combine data together
    df = pd.concat(df_list, ignore_index=True)
    return df
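You would call it with your own bucket, prefix, and region; the values below are just placeholders:

# hypothetical call: replace the bucket name, prefix, and region with your own
df = f('my-bucket', 'path/to/folder/', 'us-east-1')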
# step 2: do the clustering as you described; df now contains all files from the S3 folder
from sklearn.cluster import DBSCAN

can = df.drop(columns=['pat'])
X = can.iloc[:, [1, 2, 3, 4]].values
dbscan = DBSCAN(eps=3, min_samples=4)
# ... continue as in the local example, e.g. dbscan.fit(X)
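If you also want to see which cluster each row ends up in, a minimal sketch continuing from the objects above (df, can, X, dbscan) could be:

# fit DBSCAN and attach the resulting cluster labels back to the feature frame
labels = dbscan.fit_predict(X)
can['cluster'] = labels
print(can['cluster'].value_counts())  # -1 marks noise points in DBSCAN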