I am trying to read a .csv file with PySpark.
I set up a virtual machine in one project and a bucket in another project.
Here is what I am doing:
import numpy as np
import pandas as pd
import findspark
findspark.init('/usr/lib/spark/')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as fs
from google.cloud import storage
from pyspark import SparkContext
import os
# Point the Google Cloud client at the service-account key for the bucket's project
path = 'myBucket-c892b51f8579.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = path
client = storage.Client()
name = 'https://console.cloud.google.com/storage/browser/myBucket/'
bucket_id = 'myBucket'
bucket = client.get_bucket(bucket_id)
sc = SparkContext()
spark = SparkSession(sc)
#spark.conf.set('spark.jars.packages', 'com.google.cloud.bigdataoss:gcs-connector:hadoop2-2.1.2')
# Configure the GCS connector to authenticate with the same service-account key
spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', "myBucket-c892b51f8579.json")
# The following are required if you are using OAuth
spark._jsc.hadoopConfiguration().set('fs.gs.auth.client.id', '1166340800441514xxxxx')
spark._jsc.hadoopConfiguration().set('fs.gs.auth.client.secret', 'c892b51f85798cb8dcc57d0011fa76beabaxxxxx')
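For completeness, my understanding is that the same settings can also be passed when the session is built. This is just a sketch, not something I have verified: as far as I know spark.jars.packages is only honored at session-creation time, so setting it via spark.conf.set afterwards would not work; the package coordinate and keyfile path are the same placeholders used above, and the spark.hadoop.* prefix is how Spark forwards entries into the Hadoop configuration.

from pyspark.sql import SparkSession

# Sketch: pass the connector package and GCS settings at session-creation time.
spark = (
    SparkSession.builder
    .appName('read-gcs-csv')
    .config('spark.jars.packages', 'com.google.cloud.bigdataoss:gcs-connector:hadoop2-2.1.2')
    .config('spark.hadoop.fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
    .config('spark.hadoop.fs.gs.auth.service.account.enable', 'true')
    .config('spark.hadoop.google.cloud.auth.service.account.json.keyfile', 'myBucket-c892b51f8579.json')
    .getOrCreate()
)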
I can read the file as a pandas DataFrame, but not with PySpark.
for i, file in enumerate(client.list_blobs(bucket_id, prefix='myFolder')):
    file_path = "gs://{}/{}".format(file.bucket.name, file.name)
    if i == 0:
        df = pd.read_csv(file_path)
        print("Print DataFrame from pandas\n") #### PRINT DATAFRAME
        print(df.head(5))
        df1 = spark.read.format('csv').option("header", "true").option("inferSchema", "true").option("mode", "DROPMALFORMED").load([file_path])
This is the output:
Print DataFrame from pandas
identifier identifier_type timestamp time_zone_name ... province_short ip_address device_horizontal_accuracy source_id
0 ea48f283-494f-4e84-8369-94c0340b749e idfa 2020-03-18 23:59:16 UTC America/Chicago ... US.TX 99.43.106.241 18.000000 6d7f114f73b49b40aa15260a856b51068b9181ea628f71...
1 9b91c8a3-468f-457c-9837-af6118a8f034 idfa 2020-03-18 05:58:20 UTC America/Chicago ... US.TX 73.232.180.67 7.000000 2051ff772f65f048db9f2f32c2d02b548599f2a3e4254c...
2 58bddd43-7cdd-4cf5-9de7-bee91a04153c idfa 2020-03-19 05:00:01 UTC America/Los_Angeles ... US.CA 71.95.63.254 10.000000 6d7f114f73b49b40aa15260a856b51068b9181ea628f71...
3 985fbdb5-86fe-4ede-9ef9-aeeda57b06a3 idfa 2020-03-18 13:51:40 UTC America/Chicago ... US.TX 2600:387:a:9::78 1627.838517 cda89525af15a7e5ede988453fe40b348163bdf56fcd66...
4 c2be7747-2bb2-4d4c-9110-442d3837b99c gaid 2020-03-18 07:53:08 UTC America/Los_Angeles ... US.CA 172.58.39.47 8.000000 9e6c1827914b40ea5d2a437ebc4a9cd6ef486b4688a304...
py4j.protocol.Py4JJavaError: An error occurred while calling o54.load.
: java.io.IOException: Error accessing Bucket myVirtualMachine-staging-us-central1-939717789215-lkntecyc
Caused by: com.google.cloud.hadoop.repackaged.gcs.com.google.api.client.googleapis.json.GoogleJsonResponseException: 403 Forbidden
{
"code" : 403,
"errors" : [ {
"domain" : "global",
"message" : "myName@myBucket.iam.gserviceaccount.com does not have storage.buckets.get access to myVirtualMachine-staging-us-central1-939717789215-lkntecyc.",
"reason" : "forbidden"
} ],
"message" : "emanmyNameuele@myBucketiam.gserviceaccount.com does not have storage.buckets.get access to myVirtualMachine-staging-us-central1-939717789215-lkntecyc."
}
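Judging by the error, the bucket Spark fails on is not the one I am trying to read but the staging bucket in the VM's project (myVirtualMachine-staging-us-central1-939717789215-lkntecyc): my service account is missing storage.buckets.get on it. Would granting the account a role that includes that permission on the staging bucket be the right fix? A sketch of what I have in mind (the account and bucket names are the redacted placeholders from the error above, and roles/storage.admin is probably broader than necessary):

gsutil iam ch serviceAccount:myName@myBucket.iam.gserviceaccount.com:roles/storage.admin gs://myVirtualMachine-staging-us-central1-939717789215-lkntecyc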