Запрос работает корректно напрямую в Vertica, но не выполняется через JDBC в PySpark:
ОШИБКА: синтаксическая ошибка в "\" или около неё.
Даже после удаления условия \$CONDITIONS возвращается ошибка:
«ОШИБКА: подзапрос в FROM должен иметь псевдоним».
-- Aggregate per-subscriber, per-server-hostname "Web Browsing" traffic over a
-- fixed date range: first/last/active-day counts plus byte and connection sums.
--
-- FIX: the original contained "where \$CONDITIONS" — "$CONDITIONS" is a Sqoop
-- placeholder that only Sqoop substitutes; sent verbatim over JDBC (as Spark
-- does) the backslash/dollar token is a syntax error. It is removed here.
SELECT
    min(date(time_stamp))            AS mindate,   -- first active day
    max(date(time_stamp))            AS maxdate,   -- last active day
    count(DISTINCT date(time_stamp)) AS noofdays,  -- number of distinct active days
    subscriber,
    server_hostname,
    sum(bytes_in)        AS DL,      -- total download bytes
    sum(bytes_out)       AS UL,      -- total upload bytes
    sum(connections_out) AS conn     -- total outbound connections
FROM traffic.stats
WHERE subscriber LIKE '41601%'
  AND date(time_stamp) BETWEEN '2019-01-25' AND '2019-01-29'
  AND signature_service_category = 'Web Browsing'
  -- IN replaces the original two-way OR over signature_service_name
  AND signature_service_name IN ('SSL v3', 'HTTP2 over TLS')
  AND server_hostname NOT LIKE '%.googleapis.%'
  AND server_hostname NOT LIKE '%.google.%'
  -- NOT IN replaces the original chain of "<>" comparisons
  AND server_hostname NOT IN (
        'doubleclick.net', 'youtube.com', 'googleadservices.com',
        'app-measurement.com', 'gstatic.com', 'googlesyndication.com',
        'google-analytics.com', 'googleusercontent.com', 'ggpht.com',
        'googletagmanager.com')
  AND server_hostname IS NOT NULL
GROUP BY subscriber, server_hostname
Я пробовал выполнить приведённый выше запрос в PySpark 1.6:
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import SQLContext, Row
from pyspark.storagelevel import StorageLevel
from pyspark.streaming import DStream
from pyspark.streaming.dstream import TransformedDStream
from pyspark.streaming.util import TransformFunction
from pyspark.rdd import RDD
from pyspark.sql import *
from pyspark.sql import SQLContext
from datetime import datetime
from pyspark.sql.types import DateType
from dateutil.parser import parse
from datetime import timedelta
from pyspark.sql import HiveContext
import string
import re
import sys,os
import pandas
# --- Spark configuration --------------------------------------------------
# NOTE(review): executor sizing (7 instances x 7 cores) is environment-
# specific — confirm against cluster capacity before reuse.
conf = (SparkConf()
        .setAppName("hivereader")
        .setMaster("yarn-client")
        .set("spark.dynamicAllocation.enabled", "false")
        .set("spark.shuffle.service.enabled", "false")
        .set("spark.io.compression.codec", "snappy")
        .set("spark.rdd.compress", "true")
        .set("spark.executor.instances", 7)
        .set("spark.executor.cores", 7)
        .set("spark.sql.inMemoryStorage.compressed", "true")
        .set("spark.sql.tungsten.enabled", "true")
        .set("spark.port.maxRetries", 200)
        )

sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# --- JDBC connection details ---------------------------------------------
url = "jdbc:vertica*****************"
properties = {
    "user": "********",
    "password": "******",
    "driver": "com.vertica.jdbc.Driver",
}

# The statement is sent verbatim to Vertica, so it must NOT contain Sqoop's
# $CONDITIONS placeholder (only Sqoop substitutes that token; over plain JDBC
# it is a syntax error).  This string already has it removed.
query = "SELECT MIN(date(time_stamp)) mindate, MAX(date(time_stamp)) maxdate,COUNT (distinct date(time_stamp)) noofdays, subscriber, server_hostname, SUM(bytes_in) DL, SUM(bytes_out) UL, SUM(connections_out) conn FROM traffic.stats t WHERE SUBSCRIBER LIKE '41601%' AND date(time_stamp) between '2019-01-25' and '2019-01-29'AND signature_service_category = 'Web Browsing' AND signature_service_name IN ('SSL v3', 'HTTP2 over TLS')AND server_hostname IS NOT NULL AND server_hostname NOT LIKE '%.googleapis.%' AND server_hostname NOT LIKE '%.google.%' AND server_hostname NOT IN ( 'doubleclick.net', 'youtube.com', 'googleadservices.com', 'app-measurement.com', 'gstatic.com', 'googlesyndication.com', 'google-analytics.com', 'googleusercontent.com', 'ggpht.com', 'googletagmanager.com') GROUP BY subscriber, server_hostname"

# FIX: use the lowercase short name "jdbc" — the data-source lookup in
# Spark 1.6 resolves registered short names case-sensitively, so
# format("JDBC") can fail to find the provider.
# The query is wrapped as a derived table for `dbtable`; the trailing alias
# ("as temp") is mandatory, otherwise the server rejects it with
# "subquery in FROM must have an alias".
df = sqlContext.read.format("jdbc").options(
    url=url,
    dbtable="( " + query + " ) as temp",
    **properties
).load()

df.show(50)
В конце запроса я добавил «as x», а в начало — «select * from (исходный запрос)»; запрос выполнился, но результат не отобразился. Spark 1.6 не позволяет нам передавать прямой запрос, поэтому я попытался оформить его как временную таблицу в параметре dbtable.