-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzip_csv_data_source.py
More file actions
104 lines (80 loc) · 3.12 KB
/
zip_csv_data_source.py
File metadata and controls
104 lines (80 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from pathlib import Path
from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
from pyspark.sql.types import StructType
class RangePartition(InputPartition):
    """A partition describing a numeric row range from ``start`` to ``end``."""

    def __init__(self, start, end):
        # Bounds are stored as-is; their interpretation is up to the reader.
        self.start, self.end = start, end
class ZipDataSourceReader(DataSourceReader):
    """Reader that yields one row per text line inside zip archive member files.

    The ``path`` option may point either at a single ``.zip`` file or at a
    directory, in which case every ``*.zip`` found recursively under it is read.
    Each yielded row is a single-column list matching the "line string" schema.
    """

    def __init__(self, schema, options):
        self.schema: StructType = schema
        self.options = options
        # Location of the zip file / directory of zips; None if not provided.
        self.path = self.options.get("path", None)
        # Requested parallelism; defaults to 2 partitions.
        self.numPartitions = int(self.options.get("numPartitions", 2))

    def partitions(self):
        # NOTE(review): all partitions share the same (0, 1000) range and
        # read() ignores its partition argument, so each partition re-reads
        # the full dataset — rows are duplicated numPartitions times.
        # Kept as-is to preserve existing behavior; confirm intent.
        return [RangePartition(0, 1000) for _ in range(self.numPartitions)]

    def read(self, partition):
        # Library imports must be within the method (executed on Spark workers).
        try:
            p = Path(self.path)
            if not p.exists():
                print(f"not exists {p}")
                return
            if p.is_dir():
                # A folder full of zips. Be aware of .glob() cost at extreme sizes.
                for zip_path in p.glob("**/*.zip"):
                    yield from self._read_zip(zip_path)
            else:
                # Single zip file.
                # BUG FIX: previously referenced the undefined name `file`
                # here, which raised NameError (silently swallowed below).
                yield from self._read_zip(p)
        except Exception as e:
            # Best-effort: log and stop rather than failing the task.
            print(e)

    def _read_zip(self, zip_path):
        """Yield one single-column row per decoded line in every member of *zip_path*."""
        from zipfile import ZipFile

        with ZipFile(zip_path, "r") as archive:
            for name in archive.namelist():
                with archive.open(name, "r") as member:
                    for line in member:
                        yield [f"{zip_path}, {name} {line.decode('utf-8')}"]
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StructType
class ZipDataSource(DataSource):
    """
    An example batch data source that reads text lines out of zip archives
    using the standard-library ``zipfile`` module (via ZipDataSourceReader).

    Registered under the short name "Zip"; exposes a single-column schema.
    """

    @classmethod
    def name(cls):
        # Short name used with spark.read.format("Zip").
        return "Zip"

    def schema(self):
        # One string column per line read from the archives.
        return "line string"

    def reader(self, schema: StructType):
        return ZipDataSourceReader(schema, self.options)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
def create_spark_session(app_name="PySpark CSV Zips Datasource Tester"):
    """
    Build and return a local SparkSession with off-heap memory enabled.
    """
    builder = (
        SparkSession.builder
        .appName(app_name)
        .master("local[*]")
        .config("spark.memory.offHeap.enabled", "true")
        .config("spark.memory.offHeap.size", "1g")
    )
    return builder.getOrCreate()
if __name__ == "__main__":
    print("Hello")
    # Register the custom data source, then dump everything read from ./zips
    # out as plain text under ./saves.
    spark = create_spark_session()
    spark.dataSource.register(ZipDataSource)
    # TODO: Fix re-create CSV data
    # df = spark.read.format("Zip").load("./zips/x.zip").limit(3)
    # print(df.collect())
    frame = spark.read.format("Zip").load("./zips")
    writer = frame.write.format("text").mode("overwrite")
    writer.save("./saves")