我有3个文件-kafka producer.py、consumer.py和spark-job.py。我不知道如何启动spark文件来处理来自kafka的生成数据流
在第一个终端中启动zookeeper服务器:
.\bin\windows\zookeeper-start.bat .\config\zookeeper.properties
然后在第二个单独的终端中启动kafka服务器:
.\bin\windows\kafka-server-start.bat .\config\server.properties
然后在两个单独的终端中,我启动producer.py和consumer.py
producer.py 只是生成一个包含以下键的数据字典:
{branch, currency, amount}
并每隔约5秒将其发送到Kafka集群
from json import dumps
from time import sleep
from numpy.random import choice, randint
from kafka import KafkaProducer
def get_random_value():
    """Build one random transaction record.

    Returns:
        dict: keys 'currency', 'amount' and 'branch' (inserted in that
        order). 'currency' and 'branch' are drawn from fixed lists and
        'amount' comes from ``randint(1, 100)``.
    """
    branches = ["Almaty", "Astana", "Taraz", "Semei"]
    currencies = ["KZT", "RUB", "GBP", "USD"]
    # Dict-literal values are evaluated top to bottom, so the random draws
    # happen in the order currency -> amount -> branch.
    return {
        'currency': choice(currencies),
        'amount': randint(1, 100),
        'branch': choice(branches),
    }
if __name__ == "__main__":
    # Producer entry point: publish random transaction records to the
    # 'transaction' topic on the local broker until interrupted.
    producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'],
                             value_serializer=lambda x: dumps(x).encode('utf-8'),
                             compression_type='gzip')
    topic_name = 'transaction'
    try:
        while True:
            for _ in range(100):
                data = get_random_value()
                try:
                    message = producer.send(topic=topic_name, value=data)
                    # Block until the broker acknowledges so the offset can be shown.
                    record_data = message.get(timeout=10)
                    print('data: {}, offset: {}'.format(data, record_data.offset))
                except Exception as e:
                    # Best effort: report the failed send and keep producing.
                    print(e)
                finally:
                    producer.flush()
                # NOTE(review): the original indentation was lost; the pause is
                # placed per message to match "one record every ~5 seconds" -- confirm.
                sleep(5)
    except KeyboardInterrupt:
        # Ctrl+C is the only way out of the infinite loop; fall through to close().
        pass
    finally:
        # Previously unreachable after `while True`; now always executed.
        producer.close()
消费者只是在收到消息后将其打印出来:
from kafka import KafkaConsumer
import json
# Subscribe to the 'transaction' topic on the local broker and print every
# record as it arrives.
consumer = KafkaConsumer('transaction', bootstrap_servers=['127.0.0.1:9092'])
print("start consuming")
for message in consumer:
    # message.value is raw bytes; decode and parse the JSON payload.
    payload = json.loads(message.value.decode())
    fields = (payload['currency'], payload['amount'], payload['branch'])
    print("currency: %s, amount: %d, branch: %s" % fields)
生产者和消费者都能正常工作,并同时在各自的终端输出
Spark-job.py侦听localhost:9092(kafka也位于此处),并将传入数据写入数据库
import sys
import os
import shutil
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from pyspark.streaming.kafka import KafkaUtils
import json
# Directory passed to StreamingContext.getOrCreate as the checkpoint path.
outputPath = 'C:/Users/Admin/Downloads/madi_kafka/logs/checkpoints01'
def get_sql_query():
    """Return the aggregation query run against the ``exchanges_stream`` view.

    The view is registered from rows with columns ``city``, ``currency`` and
    ``amount``, so the query reads ``t.city`` (there is no ``branch`` column)
    and groups by the non-aggregated columns so that ``sum(amount)`` is valid.

    Returns:
        str: SQL text producing curr_time, city, currency and summed amount.
    """
    strSQL = ('select from_unixtime(unix_timestamp()) as curr_time,'
              't.city as city,'
              't.currency as currency,'
              'sum(amount) as amount '
              'from exchanges_stream t '
              'group by t.city, t.currency')
    return strSQL
# -------------------------------------------------
# Lazily instantiated global instance of SparkSession
# -------------------------------------------------
def getSparkSessionInstance(sparkConf):
    """Return the module-wide SparkSession, building it on first use.

    Args:
        sparkConf: Spark configuration handed to the session builder the
            first time this is called; ignored once a session is cached.

    Returns:
        The SparkSession stored under 'sparkSessionSingletonInstance' in
        this module's globals.
    """
    module_scope = globals()
    if 'sparkSessionSingletonInstance' in module_scope:
        return module_scope['sparkSessionSingletonInstance']
    builder = SparkSession.builder.config(conf=sparkConf)
    module_scope['sparkSessionSingletonInstance'] = builder.getOrCreate()
    return module_scope['sparkSessionSingletonInstance']
# -------------------------------------------------
# What I want to do per each RDD...
# -------------------------------------------------
def process(time, rdd):
    """Aggregate one micro-batch and append the result to Postgres.

    Args:
        time: Batch time supplied by ``foreachRDD``; only printed.
        rdd: RDD whose elements are indexed by 'branch', 'currency' and
            'amount'.

    Errors are printed instead of raised so one bad batch does not stop
    the stream.
    """
    print("===========-----> %s <-----===========" % str(time))
    try:
        # Schema inference fails on an empty RDD, so skip batches with no
        # records instead of reporting a spurious error every interval.
        if rdd.isEmpty():
            return
        spark = getSparkSessionInstance(rdd.context.getConf())
        rowRdd = rdd.map(lambda w: Row(city=w['branch'],
                                       currency=w['currency'],
                                       amount=w['amount']))
        testDataFrame = spark.createDataFrame(rowRdd)
        testDataFrame.createOrReplaceTempView("exchanges_stream")
        sql_query = get_sql_query()
        testResultDataFrame = spark.sql(sql_query)
        testResultDataFrame.show(n=5)
        # Insert into DB
        # NOTE(review): credentials are hard-coded; consider loading them
        # from the environment or a config file.
        try:
            testResultDataFrame.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", 'org.postgresql.Driver') \
                .option("url", "jdbc:postgresql://xxx") \
                .option("dbtable", "transaction_flow") \
                .option("user", "habr") \
                .option("password", "habr12345") \
                .save()
            print('DB write succesfull !')
        except Exception as e:
            print("-->Error with DB working!", e)
    except Exception as e:
        print("--> Error!", e)
# -------------------------------------------------
# General function
# -------------------------------------------------
def createContext():
    """Build the StreamingContext that reads the Kafka topic and runs ``process``.

    The broker list and topic are taken from ``sys.argv[1:]`` (exactly two
    values are expected).

    Returns:
        StreamingContext: configured but not yet started.

    Raises:
        ConnectionError: if the direct Kafka stream cannot be created; the
            underlying exception is attached as ``__cause__``.
    """
    sc = SparkContext(appName="PythonStreamingKafkaTransaction")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 10)  # batch duration of 10
    broker_list, topic = sys.argv[1:]
    try:
        directKafkaStream = KafkaUtils.createDirectStream(
            ssc,
            [topic],
            {"metadata.broker.list": broker_list})
    except Exception as err:
        # Only trap ordinary errors (not KeyboardInterrupt/SystemExit) and
        # chain the original so the real cause, e.g. a missing Kafka
        # library on the classpath, stays visible in the traceback.
        raise ConnectionError(
            "Kafka error: Connection refused: "
            "broker_list={} topic={}".format(broker_list, topic)) from err
    # Each Kafka record arrives as a pair; element 1 holds the JSON text.
    parsed_lines = directKafkaStream.map(lambda v: json.loads(v[1]))
    # RDD handling
    parsed_lines.foreachRDD(process)
    return ssc
if __name__ == "__main__":
    # Spark job entry point: expects <broker_list> <topic> on the command line.
    if len(sys.argv) != 3:
        # The first argument is used as "metadata.broker.list", i.e. the Kafka
        # broker address, not the ZooKeeper address.
        print("Usage: spark_job.py <broker_list> <topic>", file=sys.stderr)
        sys.exit(-1)
    print("--> Creating new context")
    if os.path.exists(outputPath):
        # Remove the directory named by the variable; the original passed the
        # literal string 'outputPath', which is a different, relative path.
        shutil.rmtree(outputPath)
    ssc = StreamingContext.getOrCreate(outputPath, createContext)
    ssc.start()
    ssc.awaitTermination()
我不知道如何启动spark-job.py
在生产者持续生成消息的同时,我尝试用以下命令启动:
spark-submit \
--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2,\
org.postgresql:postgresql:9.4.1207 \
spark_job.py localhost:9092 transaction
结果报错:
Exception in thread "main" org.apache.spark.SparkException: Cannot load main class from JAR org.postgresql:postgresql:9.4.1207 with URI org.postgresql. Please specify a class through --class.
如果我尝试启动此cmd:
python.exe .\spark_job.py 127.0.0.1:2181 transaction
它确实启动并创建了新的上下文,但仍然找不到某些文件:
--> Creating new context
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
20/07/25 06:12:46 WARN Checkpoint: Checkpoint directory C:/Users/Admin/Downloads/madi_kafka/logs/checkpoints01 does not exist
________________________________________________________________________________________________
Spark Streaming's Kafka libraries not found in class path. Try one of the following.
1. Include the Kafka library and its dependencies with in the
spark-submit command as
$ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8:2.4.6 ...
2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-0-8-assembly, Version = 2.4.6.
Then, include the jar in the spark-submit command as
$ bin/spark-submit --jars <spark-streaming-kafka-0-8-assembly.jar> ...
________________________________________________________________________________________________
Traceback (most recent call last):
File ".\spark_job.py", line 88, in createContext
{"metadata.broker.list": broker_list})
File "C:\python37\lib\site-packages\pyspark\streaming\kafka.py", line 138, in createDirectStream
helper = KafkaUtils._get_helper(ssc._sc)
File "C:\python37\lib\site-packages\pyspark\streaming\kafka.py", line 217, in _get_helper
return sc._jvm.org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper()
TypeError: 'JavaPackage' object is not callable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File ".\spark_job.py", line 114, in <module>
ssc = StreamingContext.getOrCreate(outputPath, lambda: createContext())
File "C:\python37\lib\site-packages\pyspark\streaming\context.py", line 107, in getOrCreate
ssc = setupFunc()
File ".\spark_job.py", line 114, in <lambda>
ssc = StreamingContext.getOrCreate(outputPath, lambda: createContext())
File ".\spark_job.py", line 91, in createContext
broker_list={} topic={}".format(broker_list, topic))
ConnectionError: Kafka error: Connection refused: broker_list=127.0.0.1:2181 topic=transaction
packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2
是正确的假定
spark-sql-kafka
)……我想指出的是:您的错误信息表明您使用的是 Spark 2.4.6;Spark Streaming 已不推荐使用,而 SQL Kafka 软件包可以让您省去将 RDD 转换为 DataFrame 的麻烦。第一个错误与缺少 Postgres 类有关。如前所述,既然 Kafka Connect 正是为此目的而存在的,我强烈建议不要使用 Spark;但要解决这个问题,需要将 Postgres JAR 添加到 packages 列表(或者更准确地说,是 Spark 类路径)中。
第二个错误是因为您现在缺少 --packages 参数;直接用 python 运行时,需要通过名为
PYSPARK_SUBMIT_ARGS
的环境变量来传入该
参数。相关问题 更多 >
编程相关推荐