Skip to content

Commit

Permalink
removed spark RDD and geni (#455)
Browse files Browse the repository at this point in the history
  • Loading branch information
behrica authored Feb 18, 2025
1 parent 1cc7d1d commit 4afcf93
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 107 deletions.
10 changes: 1 addition & 9 deletions deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,7 @@
org.slf4j/slf4j-api]}
org.lz4/lz4-java {:mvn/version "1.8.0"}
com.cnuernber/jarrow {:mvn/version "1.000"}
zero.one/geni {:mvn/version "0.0.34"
:exclusions [commons-codec/commons-codec]}
org.apache.spark/spark-avro_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-core_2.12 {:mvn/version "3.0.1"
:exclusions [org.slf4j/slf4j-log4j12]}
org.apache.spark/spark-hive_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-mllib_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-sql_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-streaming_2.12 {:mvn/version "3.0.1"}

org.tribuo/tribuo-all {:mvn/version "4.3.1" :extension "pom"}
}
:extra-paths ["neanderthal" "test"]}
Expand Down
65 changes: 0 additions & 65 deletions java/tech/v3/dataset/SimpleRDD.java

This file was deleted.

57 changes: 24 additions & 33 deletions src/tech/v3/libs/spark.clj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
[org.apache.spark.sql.types StructType StructField
DataTypes DataType]
[tech.v3.datatype ObjectReader]
[tech.v3.dataset SimpleRDD]
[java.time LocalDate Instant]
[java.util List]))

Expand Down Expand Up @@ -167,40 +166,8 @@
(ds->spark-dataset ds session nil)))


(defn default-ds-fn
[src]
(-> (ds-io/->dataset src)
(prepare-ds-for-spark)
(dataset->row-list)))


(defn ds-src-data->rdd
"Given a session, a full namespaced name that resolves to an IFn,
and a list of serializable data produce an RDD."
(^Dataset [^SparkSession spark-session
^String ds-fn-name
ds-src-data]
(SimpleRDD. (.sparkContext spark-session)
(vec ds-src-data)
ds-fn-name)))


(comment
(require '[zero-one.geni.core :as g])
(require '[zero-one.geni.defaults :as geni-defaults])
(def dataframe (g/read-csv! "test/data/stocks.csv"))
(require '[tech.v3.dataset :as ds])
(def stocks (ds/->dataset "test/data/stocks.csv"))
(def session @geni-defaults/spark)
(def schema (-> (ds/->dataset "test/data/stocks.csv")
(prepare-ds-for-spark)
(ds-schema)))
(def rdd (ds-src-data->dataset @geni-defaults/spark
schema
"tech.v3.libs.spark/default-ds-fn"
[[{:a 1} {:a 2}]]))
)


(defn collect-spark-dataset->ds
[^Dataset dataset]
Expand All @@ -227,3 +194,27 @@
nil
[]))))
(ds-impl/new-dataset))))



(comment
;; databricks-connect specific classes
;; should work similar for spark-connect

;;Tested with hese deps
;;org.scala-lang/scala-reflect {:mvn/version "2.12.18"}
;;com.databricks/databricks-connect {:mvn/version "16.1.0"}


(import
'[com.databricks.connect DatabricksSession]
'[com.databricks.sdk.core DatabricksConfig])

(def config (.. (DatabricksConfig.) (setProfile "adb-xxxxx")))
(def spark (.. (DatabricksSession/builder) (sdkConfig config) getOrCreate))

(->
(.sql spark "show catalogs;")
collect-spark-dataset->ds)

)

0 comments on commit 4afcf93

Please sign in to comment.