Vertica: запрос SELECT с функцией GROUP BY не работает - PullRequest
0 голосов
/ 16 февраля 2019

Запрос работает хорошо напрямую в Vertica, но не работает через JDBC в PySpark:

ОШИБКА: синтаксическая ошибка в или около "\"

Даже после удаления условия \$CONDITIONS запрос возвращает:

"ОШИБКА: подзапрос в FROM должен иметь псевдоним"

-- Per-subscriber, per-hostname traffic totals over a fixed date window,
-- excluding Google-owned hostnames.
-- NOTE(review): \$CONDITIONS is a Sqoop free-form-query placeholder, not
-- Vertica SQL; the backslash escape only makes sense inside a shell-quoted
-- Sqoop command line, which is presumably why Vertica reports a syntax
-- error at "\" when this text is sent through other clients.
SELECT 
 min(date(time_stamp)) mindate
,max(date(time_stamp)) maxdate
,count (distinct date(time_stamp)) noofdays
, subscriber
, server_hostname
, sum(bytes_in) DL
, sum(bytes_out) UL
, sum(connections_out) conn 
from traffic.stats 
where \$CONDITIONS 
and SUBSCRIBER like '41601%' 
and date(time_stamp) between '2019-01-25' and '2019-01-29'
and signature_service_category = 'Web Browsing' 
and (signature_service_name = 'SSL v3' 
  or signature_service_name = 'HTTP2 over TLS') 
and server_hostname not like '%.googleapis.%' 
and server_hostname not like '%.google.%' 
and server_hostname <> 'doubleclick.net'  
and server_hostname <> 'youtube.com'  
and server_hostname <> 'googleadservices.com'  
and server_hostname <> 'app-measurement.com' 
and server_hostname <> 'gstatic.com' 
and server_hostname <> 'googlesyndication.com' 
and server_hostname <> 'google-analytics.com'  
and server_hostname <> 'googleusercontent.com'  
and server_hostname <> 'ggpht.com'  
and server_hostname <> 'googletagmanager.com' 
and server_hostname is not null 
group by subscriber, server_hostname

Я пробовал вышеупомянутый запрос на PySpark 1.6:

from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import SQLContext, Row
from pyspark.storagelevel import StorageLevel
from pyspark.streaming import DStream
from pyspark.streaming.dstream import TransformedDStream
from pyspark.streaming.util import TransformFunction
from pyspark.rdd import RDD
from pyspark.sql import *
from pyspark.sql import SQLContext
from datetime import datetime
from pyspark.sql.types import DateType
from dateutil.parser import parse
from datetime import timedelta
from pyspark.sql import HiveContext
import string
import re
import sys,os
import pandas




# Spark 1.6 configuration: dynamic allocation disabled (static 7 executors
# with 7 cores each), snappy compression for shuffle I/O, RDD compression,
# Tungsten and in-memory columnar compression enabled, and up to 200 port
# retries.
_spark_settings = [
    ("spark.dynamicAllocation.enabled", "false"),
    ("spark.shuffle.service.enabled", "false"),
    ("spark.io.compression.codec", "snappy"),
    ("spark.rdd.compress", "true"),
    ("spark.executor.instances", 7),
    ("spark.executor.cores", 7),
    ("spark.sql.inMemoryStorage.compressed", "true"),
    ("spark.sql.tungsten.enabled", "true"),
    ("spark.port.maxRetries", 200),
]

conf = SparkConf().setAppName("hivereader").setMaster("yarn-client")
for _key, _value in _spark_settings:
    conf = conf.set(_key, _value)

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)



# JDBC connection settings (credentials redacted in the original post).
url = "jdbc:vertica*****************"
properties = {
    "user": "********",
    "password": "******",
    "driver": "com.vertica.jdbc.Driver"
}

# Aggregation query pushed down to Vertica.  Built from adjacent string
# literals so each clause is readable; note the explicit trailing space on
# every fragment — the original single-line string ran tokens together
# ('2019-01-29'AND ..., TLS')AND ...), which is fragile across SQL parsers.
query = (
    "SELECT MIN(date(time_stamp)) mindate, MAX(date(time_stamp)) maxdate, "
    "COUNT(distinct date(time_stamp)) noofdays, subscriber, server_hostname, "
    "SUM(bytes_in) DL, SUM(bytes_out) UL, SUM(connections_out) conn "
    "FROM traffic.stats t "
    "WHERE SUBSCRIBER LIKE '41601%' "
    "AND date(time_stamp) between '2019-01-25' and '2019-01-29' "
    "AND signature_service_category = 'Web Browsing' "
    "AND signature_service_name IN ('SSL v3', 'HTTP2 over TLS') "
    "AND server_hostname IS NOT NULL "
    "AND server_hostname NOT LIKE '%.googleapis.%' "
    "AND server_hostname NOT LIKE '%.google.%' "
    "AND server_hostname NOT IN ('doubleclick.net', 'youtube.com', "
    "'googleadservices.com', 'app-measurement.com', 'gstatic.com', "
    "'googlesyndication.com', 'google-analytics.com', 'googleusercontent.com', "
    "'ggpht.com', 'googletagmanager.com') "
    "GROUP BY subscriber, server_hostname"
)

# A subquery passed as `dbtable` must carry an alias ("( ... ) as temp"),
# otherwise Vertica raises "subquery in FROM must have an alias".  The
# data-source short name is conventionally lowercase "jdbc".
df = sqlContext.read.format("jdbc").options(
    url=url,
    dbtable="( " + query + " ) as temp",
    **properties
).load()

df.show(50)

В конце подзапроса я добавил псевдоним «as x», а в начале — «select * from (обычный запрос)»; запрос выполнился, но результат не отображался. Spark 1.6 не позволяет нам передавать прямой запрос, поэтому я попробовал оформить его как временную таблицу db.table.

...