Assume I run a Python script (file1.py) that takes a text file as a parameter. I run it as follows:
python file1.py textfile1.txt
Inside file1.py is the following code:
from pyspark import SparkContext
....
#I can read the file using the following command
sc = SparkContext()
inputfile= sc.textFile(sys.argv[1])
What modifications must I make so that file1.py runs without problems?
However, pyspark doesn't work for me; I usually use spark-submit. It gives me the following error when I run the script with spark-submit in local mode:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
hduser@noorhadoop-virtual-machine:/usr/local/spark$ ./bin/spark-submit --master local[3] /home/noorhadoop/Desktop/folder1/file1.py /home/noorhadoop/Desktop/folder1/simple1.txt
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Error in sys.excepthook:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
from apport.fileutils import likely_packaged, get_recent_crashes
File "/usr/lib/python3/dist-packages/apport/__init__.py", line 5, in <module>
from apport.report import Report
File "/usr/lib/python3/dist-packages/apport/report.py", line 21, in <module>
from urllib.request import urlopen
File "/usr/lib/python3.6/urllib/request.py", line 88, in <module>
import http.client
File "/usr/lib/python3.6/http/client.py", line 71, in <module>
import email.parser
File "/usr/lib/python3.6/email/parser.py", line 12, in <module>
from email.feedparser import FeedParser, BytesFeedParser
File "/usr/lib/python3.6/email/feedparser.py", line 27, in <module>
from email._policybase import compat32
File "/usr/lib/python3.6/email/_policybase.py", line 9, in <module>
from email.utils import _has_surrogates
File "/usr/lib/python3.6/email/utils.py", line 31, in <module>
import urllib.parse
File "/usr/lib/python3.6/urllib/parse.py", line 227, in <module>
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Original exception was:
Traceback (most recent call last):
File "/home/noorhadoop/Desktop/folder1/file1.py", line 4, in <module>
from pyspark import SparkContext
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/__init__.py", line 44, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/context.py", line 33, in <module>
File "<frozen importlib._bootstrap>", line 961, in _find_and_load
File "<frozen importlib._bootstrap>", line 950, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 646, in _load_unlocked
File "<frozen importlib._bootstrap>", line 616, in _load_backward_compatible
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/java_gateway.py", line 25, in <module>
File "/usr/lib/python3.6/platform.py", line 909, in <module>
"system node release version machine processor")
File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 381, in namedtuple
TypeError: namedtuple() missing 3 required keyword-only arguments: 'verbose', 'rename', and 'module'
Thanks,
You didn't post the error messages, so it is hard to know exactly what is wrong, but sc.textFile
expects the full path of the file, either on HDFS or on the local file system.
For example, if you are running Spark in local mode, you will have to pass the arguments to spark-submit as follows:
spark-submit \
--master local[*] \
/path/to/file1.py \
"file:///path/to/textfile1.txt"
Or, if you are running on a cluster, give the full HDFS path as the argument:
spark-submit \
--master spark://localhost:7077 \
/path/to/file1.py \
"hdfs://localhost:9000/path/to/textfile1.txt"