Skip to content

Instantly share code, notes, and snippets.

@stephanie-wang
Created February 16, 2021 19:34
Show Gist options
  • Save stephanie-wang/7400fd3b50bef0c8ce2d5b29f58c2530 to your computer and use it in GitHub Desktop.
Data processing support in Ray
# Example: run Dask DataFrame workloads on a Ray cluster by plugging the
# Dask-on-Ray scheduler into Dask's configuration.
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

import ray
from ray.util.dask import ray_dask_get

# Route every Dask task graph through Ray's scheduler from here on.
dask.config.set(scheduler=ray_dask_get)
ray.init()

# Build a random 1024 x 256 integer frame, then split it into 10 partitions
# so the work is distributed across Ray workers.
source_frame = pd.DataFrame(np.random.randint(0, 100, size=(2**10, 2**8)))
distributed_frame = dd.from_pandas(source_frame, npartitions=10)
print(distributed_frame.head(10))
# Example: run Mars (tensor/dataframe framework) with Ray as its backend.
from mars.session import new_session

# Set Ray as the default execution backend for this process.
# FIX: the original snippet used typographic quotes (backend=’ray’), which
# are a SyntaxError in Python source; plain ASCII quotes are required.
ray_session = new_session(backend='ray').as_default()

import mars.dataframe as md
import mars.tensor as mt

# Random 1024 x 256 integer tensor, wrapped as a Mars DataFrame.
t = mt.random.randint(100, size=(2**10, 2**8))
df = md.DataFrame(t)
# Mars builds graphs lazily; .execute() materializes the first 10 rows.
print(df.head(10).execute())
# Example: use Modin's drop-in pandas API on top of Ray.
import ray

# Initialize Ray BEFORE importing Modin: Modin otherwise defaults to
# backing Ray's object store with disk rather than shared memory.
ray.init()

import modin.pandas as pd
import numpy as np

# 1024 x 256 frame of random integers in [0, 100).
random_values = np.random.randint(0, 100, size=(2**10, 2**8))
modin_frame = pd.DataFrame(random_values)
print(modin_frame.head(10))
# Example: run a PySpark session inside a Ray actor via RayDP.
import ray
import raydp

ray.init()


@ray.remote
class PySparkDriver:
    """Ray actor that owns a RayDP-managed Spark session.

    Hosting the Spark driver inside an actor keeps the session alive for
    the actor's lifetime and lets callers submit Spark jobs remotely.
    """

    def __init__(self):
        # FIX: the original snippet used typographic quotes (”…”), which
        # are a SyntaxError in Python; plain ASCII quotes are required.
        # The original paste had also lost the class/method indentation.
        self.spark = raydp.init_spark(
            app_name="RayDP example",
            num_executors=2,
            executor_cores=2,
            executor_memory="4GB")

    def foo(self):
        """Run a trivial Spark job: count 1000 rows across 10 partitions."""
        return self.spark.range(1000).repartition(10).count()


driver = PySparkDriver.remote()
print(ray.get(driver.foo.remote()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment