Скачать данные Google BigQuery в CSV - PullRequest
0 голосов
/ 24 марта 2019

У меня есть требование для загрузки данных из Google BigQuery.На данный момент я работаю с сервисом Google BigQuery и помещаю эти данные в фрейм данных и преобразую их в CSV-файл. Реальная проблема возникает после конвертации в CSV, после конвертации в CSV, данные превращаются во вложенную форму, но когда я нажимаютот же запрос в BigWuery веб-интерфейса.Я получаю сегрегированные данные.

Я пробовал описанную выше вещь, используя pandas gbq.

from google.cloud import bigquery
import pandas as pd
import os
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file('json.key')

def query_to_dataframe(query):
    return pd.read_gbq(query,
                     project_id=project_id,
                     credentials=credentials,
                     dialect='standard')

query = """
SELECT visitorId,
visitNumber,
visitId,
visitStartTime,
date,
totals.visits,
totals.hits,
totals.pageviews,
totals.timeOnSite,
totals.bounces,
totals.transactions,
totals.transactionRevenue,
totals.newVisits,
totals.screenviews,
totals.uniqueScreenviews,
totals.timeOnScreen,
totals.totalTransactionRevenue,
totals.sessionQualityDim,
trafficSource.referralPath,
trafficSource.campaign,
trafficSource.source,
trafficSource.medium,
trafficSource.keyword,
trafficSource.adContent,
trafficSource.adwordsClickInfo.campaignId,
trafficSource.adwordsClickInfo.adGroupId,
trafficSource.adwordsClickInfo.creativeId,
trafficSource.adwordsClickInfo.criteriaId,
trafficSource.adwordsClickInfo.page,
trafficSource.adwordsClickInfo.slot,
trafficSource.adwordsClickInfo.criteriaParameters,
trafficSource.adwordsClickInfo.gclId,
trafficSource.adwordsClickInfo.customerId,
trafficSource.adwordsClickInfo.adNetworkType,
trafficSource.adwordsClickInfo.targetingCriteria.boomUserlistId,
trafficSource.adwordsClickInfo.isVideoAd,
trafficSource.isTrueDirect,
trafficSource.campaignCode,
device.browser,
device.browserVersion,
device.browserSize,
device.operatingSystem,
device.operatingSystemVersion,
device.isMobile,
device.mobileDeviceBranding,
device.mobileDeviceModel,
device.mobileInputSelector,
device.mobileDeviceInfo,
device.mobileDeviceMarketingName,
device.flashVersion,
device.javaEnabled,
device.language,
device.screenColors,
device.screenResolution,
device.deviceCategory,
geoNetwork.continent,
geoNetwork.subContinent,
geoNetwork.country,
geoNetwork.region,
geoNetwork.metro,
geoNetwork.city,
geoNetwork.cityId,
geoNetwork.networkDomain,
geoNetwork.latitude,
geoNetwork.longitude,
geoNetwork.networkLocation,
cd.index,
cd.value,
h.hitNumber,
h.time,
h.hour,
h.minute,
h.isSecure,
h.isInteraction,
h.isEntrance,
h.isExit,
h.referer,
h.page.pagePath,
h.page.hostname,
h.page.pageTitle,
h.page.searchKeyword,
h.page.searchCategory,
h.page.pagePathLevel1,
h.page.pagePathLevel2,
h.page.pagePathLevel3,
h.page.pagePathLevel4,
h.transaction.transactionId,
h.transaction.transactionRevenue as tRevenue,
h.transaction.transactionTax,
h.transaction.transactionShipping,
h.transaction.affiliation,
h.transaction.currencyCode,
h.transaction.localTransactionRevenue,
h.transaction.localTransactionTax,
h.transaction.localTransactionShipping,
h.transaction.transactionCoupon,
h.item.transactionId as tId,
h.item.productName,
h.item.productCategory,
h.item.productSku,
h.item.itemQuantity,
h.item.itemRevenue,
h.item.currencyCode as cCode,
h.item.localItemRevenue,
h.contentInfo.contentDescription,
h.appInfo.name,
h.appInfo.version,
h.appInfo.id,
h.appInfo.installerId,
h.appInfo.appInstallerId,
h.appInfo.appName,
h.appInfo.appVersion,
h.appInfo.appId,
h.appInfo.screenName,
h.appInfo.landingScreenName,
h.appInfo.exitScreenName,
h.appInfo.screenDepth,
h.exceptionInfo.description,
h.exceptionInfo.isFatal,
h.exceptionInfo.exceptions,
h.exceptionInfo.fatalExceptions,
h.eventInfo.eventCategory,
h.eventInfo.eventAction,
h.eventInfo.eventLabel,
h.eventInfo.eventValue,
hp.productSKU as pSKU,
hp.v2ProductName,
hp.v2ProductCategory,
hp.productVariant,
hp.productBrand,
hp.productRevenue,
hp.localProductRevenue,
hp.productPrice,
hp.localProductPrice,
hp.productQuantity,
hp.productRefundAmount,
hp.localProductRefundAmount,
hp.isImpression,
hp.isClick,
hpc.index as hpcIndex,
hpc.value as hpcValue,
hpCustomMetrics.index as cusomMetricsIndex,
hpCustomMetrics.value as cusomMetricsValue,
hp.productListName,
hp.productListPosition,
hp.productCouponCode,
hpromotion.promoId, 
hpromotion.promoName,
hpromotion.promoCreative,
hpromotion.promoPosition,
h.promotionActionInfo.promoIsView,
h.promotionActionInfo.promoIsClick,
h.refund.refundAmount,
h.refund.localRefundAmount,
h.eCommerceAction.action_type,
h.eCommerceAction.step,
h.eCommerceAction.option,
hExperiment.experimentId,
hExperiment.experimentVariant,
h.publisher.dfpClicks,
h.publisher.dfpImpressions,
h.publisher.dfpMatchedQueries,
h.publisher.dfpMeasurableImpressions,
h.publisher.dfpQueries,
h.publisher.dfpRevenueCpm,
h.publisher.dfpRevenueCpc,
h.publisher.dfpViewableImpressions,
h.publisher.dfpPagesViewed,
h.publisher.adsenseBackfillDfpClicks,
h.publisher.adsenseBackfillDfpImpressions,
h.publisher.adsenseBackfillDfpMatchedQueries,
h.publisher.adsenseBackfillDfpMeasurableImpressions,
h.publisher.adsenseBackfillDfpQueries,
h.publisher.adsenseBackfillDfpRevenueCpm,
h.publisher.adsenseBackfillDfpRevenueCpc,
h.publisher.adsenseBackfillDfpViewableImpressions,
h.publisher.adsenseBackfillDfpPagesViewed,
h.publisher.adxBackfillDfpClicks,
h.publisher.adxBackfillDfpImpressions,
h.publisher.adxBackfillDfpMatchedQueries,
h.publisher.adxBackfillDfpMeasurableImpressions,
h.publisher.adxBackfillDfpQueries,
h.publisher.adxBackfillDfpRevenueCpm,
h.publisher.adxBackfillDfpRevenueCpc,
h.publisher.adxBackfillDfpViewableImpressions,
h.publisher.adxBackfillDfpPagesViewed,
h.publisher.adxClicks,
h.publisher.adxImpressions,
h.publisher.adxMatchedQueries,
h.publisher.adxMeasurableImpressions,
h.publisher.adxQueries,
h.publisher.adxRevenue,
h.publisher.adxViewableImpressions,
h.publisher.adxPagesViewed,
h.publisher.adsViewed,
h.publisher.adsUnitsViewed,
h.publisher.adsUnitsMatched,
h.publisher.viewableAdsViewed,
h.publisher.measurableAdsViewed,
h.publisher.adsPagesViewed,
h.publisher.adsClicked,
h.publisher.adsRevenue,
h.publisher.dfpAdGroup,
h.publisher.dfpAdUnits,
h.publisher.dfpNetworkId,
hcustomVariables.index as hcustomVariableIndex,
hcustomVariables.customVarName,
hcustomVariables.customVarValue,
hcustomDimensions.index as customDimensionsIndex,
hcustomDimensions.value as customDimensionsvalue,
hcustomMetrics.index as hcustoMetricsIndex,
hcustomMetrics.value as hcustomMetricsValue,
h.type,
h.social.socialInteractionNetwork,
h.social.socialInteractionAction,
h.social.socialInteractions,
h.social.socialInteractionTarget,
h.social.socialNetwork,
h.social.uniqueSocialInteractions,
h.social.hasSocialSourceReferral,
h.social.socialInteractionNetworkAction,
h.latencyTracking.pageLoadSample,
h.latencyTracking.pageLoadTime,
h.latencyTracking.pageDownloadTime,
h.latencyTracking.redirectionTime,
h.latencyTracking.speedMetricsSample,
h.latencyTracking.domainLookupTime,
h.latencyTracking.serverConnectionTime,
h.latencyTracking.serverResponseTime,
h.latencyTracking.domLatencyMetricsSample,
h.latencyTracking.domInteractiveTime,
h.latencyTracking.domContentLoadedTime,
h.latencyTracking.userTimingValue,
h.latencyTracking.userTimingSample,
h.latencyTracking.userTimingVariable,
h.latencyTracking.userTimingCategory,
h.latencyTracking.userTimingLabel,
sourcePropertyInfo.sourcePropertyDisplayName,   
sourcePropertyInfo.sourcePropertyTrackingId,
h.contentGroup.contentGroup1,
h.contentGroup.contentGroup2,
h.contentGroup.contentGroup3,
h.contentGroup.contentGroup4,
h.contentGroup.contentGroup5,
h.contentGroup.previousContentGroup1,
h.contentGroup.previousContentGroup2,
h.contentGroup.previousContentGroup3,
h.contentGroup.previousContentGroup4,
h.contentGroup.previousContentGroup5,
h.contentGroup.contentGroupUniqueViews1,
h.contentGroup.contentGroupUniqueViews2,
h.contentGroup.contentGroupUniqueViews3,
h.contentGroup.contentGroupUniqueViews4,
h.contentGroup.contentGroupUniqueViews5,
h.dataSource,
hpublisher.dfpClicks as hpublisherDfpclicks,
hpublisher.dfpImpressions as hpublisherDfpImpressions,
hpublisher.dfpMatchedQueries as hpublisherDfpMatchedQueries,
hpublisher.dfpMeasurableImpressions as hpublisherDfpMeasurableImpressions,
hpublisher.dfpQueries as hpublisherDfpQueries,
hpublisher.dfpRevenueCpm as hpublisherDfpRevenueCpm,
hpublisher.dfpRevenueCpc as hpublisherDfpRevenueCpc,
hpublisher.dfpViewableImpressions as hpublisherDfpViewableImpressions,
hpublisher.dfpPagesViewed as hpublisherDfpPagesViewed,
hpublisher.adsenseBackfillDfpClicks as hpublisherAdsenseBackfillDfpClicks,
hpublisher.adsenseBackfillDfpImpressions as hpublisherAdsenseBackfillDfpImpressions,
hpublisher.adsenseBackfillDfpMatchedQueries as hpublisherAdsenseBackfillDfpMatchedQueries,
hpublisher.adsenseBackfillDfpMeasurableImpressions as hpublisherAdsenseBackfillDfpMeasurableImpressions,
hpublisher.adsenseBackfillDfpQueries as hpublisherAdsenseBackfillDfpQueries,
hpublisher.adsenseBackfillDfpRevenueCpm as hpublisherAdsenseBackfillDfpRevenueCpm,
hpublisher.adsenseBackfillDfpRevenueCpc as hpublisherAdsenseBackfillDfpRevenueCpc,
hpublisher.adsenseBackfillDfpViewableImpressions as hpublisherAdsenseBackfillDfpViewableImpressions,
hpublisher.adsenseBackfillDfpPagesViewed as hpublisherAdsenseBackfillDfpPagesViewed,
hpublisher.adxBackfillDfpClicks as hpublisherAdxBackfillDfpClicks,
hpublisher.adxBackfillDfpImpressions as hpublisherAdxBackfillDfpImpressions,
hpublisher.adxBackfillDfpMatchedQueries as hpublisherAdxBackfillDfpMatchedQueries,
hpublisher.adxBackfillDfpMeasurableImpressions as hpublisherAdxBackfillDfpMeasurableImpressions,
hpublisher.adxBackfillDfpQueries as hpublisherAdxBackfillDfpQueries,
hpublisher.adxBackfillDfpRevenueCpm as hpublisherAdxBackfillDfpRevenueCpm,
hpublisher.adxBackfillDfpRevenueCpc as hpublisherAdxBackfillDfpRevenueCpc,
hpublisher.adxBackfillDfpViewableImpressions as hpublisherAdxBackfillDfpViewableImpressions,
hpublisher.adxBackfillDfpPagesViewed as hpublisherAdxBackfillDfpPagesViewed,
hpublisher.adxClicks as hpublisherAdxClicks,
hpublisher.adxImpressions as hpublisherAdxImpressions,
hpublisher.adxMatchedQueries as hpublisherAdxMatchedQueries,
hpublisher.adxMeasurableImpressions as hpublisherAdxMeasurableImpressions,
hpublisher.adxQueries as hpublisherAdxQueries,
hpublisher.adxRevenue as hpublisherAdxRevenue,
hpublisher.adxViewableImpressions as hpublisherAdxViewableImpressions,
hpublisher.adxPagesViewed as hpublisherAdxPagesViewed,
hpublisher.adsViewed as hpublisherAdsViewed,
hpublisher.adsUnitsViewed as hpublisherAdsUnitsViewed,
hpublisher.adsUnitsMatched as hpublisherAdsUnitsMatched,
hpublisher.viewableAdsViewed as hpublisherViewableAdsViewed,
hpublisher.measurableAdsViewed as hpublisherMeasurableAdsViewed,
hpublisher.adsPagesViewed as hpublisherAdsPagesViewed,
hpublisher.adsClicked as hpublisherAdsClicked,
hpublisher.adsRevenue as hpublisherAdsRevenue,
hpublisher.dfpAdGroup as hpublisherDfpAdGroup,
hpublisher.dfpAdUnits as hpublisherDfpAdUnits,
hpublisher.dfpNetworkId as hpublisherDfpNetworkId,
fullVisitorId,
userId,
clientId,
channelGrouping,
socialEngagementType


FROM `bigquery-public-data.google_analytics_sample.ga_sessions_20170801`
LEFT JOIN UNNEST(customDimensions) as cd
LEFT JOIN UNNEST(hits) as h
LEFT JOIN UNNEST(h.product) as hp
LEFT JOIN UNNEST(hp.customDimensions) as hpc
LEFT JOIN UNNEST(hp.customMetrics) as hpCustomMetrics
LEFT JOIN UNNEST(h.promotion) as hpromotion
LEFT JOIN UNNEST(h.experiment) as hExperiment
LEFT JOIN UNNEST(h.customVariables) as hcustomVariables
LEFT JOIN UNNEST(h.customDimensions) as hcustomDimensions
LEFT JOIN UNNEST(h.customMetrics) as hcustomMetrics
LEFT JOIN UNNEST(h.publisher_infos) as hpublisher

LIMIT 100
"""

df = query_to_dataframe(query)
print(type(df))
df.to_csv("D:/PySpark/sample.csv",index=False)

, сейчас, используя вышеописанную процедуру, я получаю вложенные столбцы для многих столбцов REPEATED и RECORD.Я хочу поместить значение столбцов REPEATED в разные столбцы и то же самое для столбцов RECORD.Пожалуйста, дайте мне знать, если это возможно, используя вышеуказанный подход или есть какой-либо другой хороший подход для загрузки всех данных без потери деталей.

...