Я исправил это.
Как говорит @QuickSilver, каждая функция должна быть определена до её использования. Кроме того, вызов фильтра для DynamicFrame должен быть записан так, как показано ниже. Функцию filter_sex при этом нужно передавать по имени, без скобок и аргументов: Filter.apply сам вызывает её для каждой записи.
filter3frame = Filter.apply(frame=datasource0, f=filter_sex)
Итак, окончательный рабочий код выглядит следующим образом:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
def filter_sex(item):
    """Return whether the record's ``sex`` field equals ``'Male'``.

    Used as the predicate ``f`` handed to ``Filter.apply``; the function is
    passed by name and is called with one record at a time.

    Args:
        item: A record that supports ``item['sex']`` subscripting
            (a plain ``dict`` works; presumably a Glue DynamicRecord in the
            job itself -- confirm against the Glue runtime).

    Returns:
        ``True`` if ``item['sex'] == 'Male'`` (exact, case-sensitive match),
        ``False`` otherwise.

    Raises:
        Whatever the record type raises when the ``'sex'`` field is absent
        (``KeyError`` for a plain ``dict``).
    """
    # The comparison already evaluates to a bool, so no if/else ladder is
    # needed to map it onto True/False.
    return item['sex'] == 'Male'
## @params: [JOB_NAME]
# Resolve the job arguments from the command line and initialise the Glue job.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = "amssurvey", table_name = "amssurvey", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
# Load the catalog table that all three filters below read from.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="amssurvey",
    table_name="amssurvey",
    transformation_ctx="datasource0",
)

# NOTE(review): the ApplyMapping annotation below is kept verbatim, but no
# ApplyMapping transform is invoked in the statements that follow -- the
# filters are applied to datasource0 directly. Confirm whether the annotation
# should be dropped or the mapping step restored.
## @type: ApplyMapping
## @args: [mapping = [("nomber", "long", "nomber", "long"), ("type", "string", "type", "string"), ("sex", "string", "sex", "string"), ("citizen", "string", "citizen", "string"), ("count", "long", "count", "long"), ("countstate", "long", "countstate", "long")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]

# Three independent filters over the same source frame:
#   1. records whose 'citizen' value is in ["US"]
#   2. records whose 'count' is greater than 50
#   3. records accepted by filter_sex
filter1frame = Filter.apply(
    frame=datasource0,
    f=lambda record: record['citizen'] in ["US"],
)
filter2frame = Filter.apply(
    frame=datasource0,
    f=lambda record: record['count'] > 50,
)
filter3frame = Filter.apply(
    frame=datasource0,
    f=filter_sex,
)

# Write each filtered frame to its own S3 prefix as CSV.
filter1_op = glueContext.write_dynamic_frame.from_options(
    frame=filter1frame,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation3/SourceFiles/filter1_op"},
    format="csv",
    transformation_ctx="filter1_op",
)
filter2_op = glueContext.write_dynamic_frame.from_options(
    frame=filter2frame,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation3/SourceFiles/filter2_op"},
    format="csv",
    transformation_ctx="filter2_op",
)
filter3_op = glueContext.write_dynamic_frame.from_options(
    frame=filter3frame,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation3/SourceFiles/filter3_op"},
    format="csv",
    transformation_ctx="filter3_op",
)

job.commit()