# pipeline.py
import logging
import pathlib
import time
from collections import deque
from datetime import datetime
from threading import Event
from typing import Any, Callable, Dict, Generator, List, Mapping, Optional
from uuid import UUID
import pandas
from feldera._callback_runner import CallbackRunner
from feldera._helpers import chunk_dataframe, ensure_dataframe_has_columns
from feldera.enums import (
BootstrapPolicy,
CheckpointStatus,
CompletionTokenStatus,
DeploymentDesiredStatus,
DeploymentResourcesDesiredStatus,
DeploymentResourcesStatus,
DeploymentRuntimeDesiredStatus,
DeploymentRuntimeStatus,
PipelineFieldSelector,
PipelineStatus,
ProgramStatus,
StorageStatus,
TransactionStatus,
)
from feldera.output_handler import OutputHandler
from feldera.rest.errors import FelderaAPIError
from feldera.rest.feldera_client import FelderaClient
from feldera.rest.pipeline import Pipeline as InnerPipeline
from feldera.rest.sql_table import SQLTable
from feldera.rest.sql_view import SQLView
from feldera.runtime_config import RuntimeConfig
from feldera.stats import PipelineStatistics
from feldera.types import CheckpointMetadata
class Pipeline:
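    """
    A handle to a Feldera pipeline, wrapping the Feldera REST client.

    Example (a minimal usage sketch; the URL, API key, and pipeline name are
    placeholders, and the ``FelderaClient`` constructor arguments shown are an
    assumption, not defined in this module):

    .. code-block:: python

        from feldera.rest.feldera_client import FelderaClient

        client = FelderaClient("http://localhost:8080")
        pipeline = Pipeline.get("my-pipeline", client)
        pipeline.start()
        pipeline.wait_for_completion(force_stop=True)
    """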
def __init__(self, client: FelderaClient):
self.client: FelderaClient = client
self._inner: InnerPipeline | None = None
@staticmethod
def _from_inner(inner: InnerPipeline, client: FelderaClient) -> "Pipeline":
pipeline = Pipeline(client)
pipeline._inner = inner
return pipeline
def refresh(self, field_selector: PipelineFieldSelector):
"""
        Calls the backend to fetch the latest version of the pipeline.
:param field_selector: Choose what pipeline information to refresh; see PipelineFieldSelector enum definition.
:raises FelderaConnectionError: If there is an issue connecting to the backend.
"""
self._inner = self.client.get_pipeline(self.name, field_selector)
def status(self) -> PipelineStatus:
"""
Return the current status of the pipeline.
"""
try:
self.refresh(PipelineFieldSelector.STATUS)
return PipelineStatus.from_str(self._inner.deployment_status)
except FelderaAPIError as err:
if err.status_code == 404:
return PipelineStatus.NOT_FOUND
else:
raise err
def wait_for_status(
self, expected_status: PipelineStatus, timeout: Optional[int] = None
) -> None:
"""
Wait for the pipeline to reach the specified status.
:param expected_status: The status to wait for
:param timeout: Maximum time to wait in seconds. If None, waits forever (default: None)
:raises TimeoutError: If the expected status is not reached within the timeout
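        Example (a minimal sketch; assumes the pipeline was started elsewhere):

        .. code-block:: python

            from feldera.enums import PipelineStatus

            pipeline.wait_for_status(PipelineStatus.RUNNING, timeout=60)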
"""
start_time = time.monotonic()
while True:
current_status = self.status()
if current_status == expected_status:
return
if timeout is not None and time.monotonic() - start_time >= timeout:
raise TimeoutError(
f"Pipeline did not reach {expected_status.name} status within {timeout} seconds"
)
time.sleep(1)
def stats(self) -> PipelineStatistics:
"""Gets the pipeline metrics and performance counters."""
return PipelineStatistics.from_dict(self.client.get_pipeline_stats(self.name))
def logs(self) -> Generator[str, None, None]:
"""Gets the pipeline logs."""
return self.client.get_pipeline_logs(self.name)
def input_pandas(self, table_name: str, df: pandas.DataFrame, force: bool = False):
"""
Push all rows in a pandas DataFrame to the pipeline.
        The pipeline must be in the RUNNING or PAUSED state to push data.
An error will be raised if the pipeline is in any other state.
The dataframe must have the same columns as the table in the pipeline.
:param table_name: The name of the table to insert data into.
:param df: The pandas DataFrame to be pushed to the pipeline.
:param force: `True` to push data even if the pipeline is paused. `False` by default.
:raises ValueError: If the table does not exist in the pipeline.
:raises RuntimeError: If the pipeline is not in a valid state to push data.
:raises RuntimeError: If the pipeline is paused and force is not set to `True`.
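        Example (a minimal sketch; assumes the pipeline declares a table
        ``users`` with matching columns):

        .. code-block:: python

            import pandas

            df = pandas.DataFrame([{"id": 1, "name": "alice"}])
            pipeline.input_pandas("users", df)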
"""
status = self.status()
if status not in [
PipelineStatus.RUNNING,
PipelineStatus.PAUSED,
]:
raise RuntimeError("Pipeline must be running or paused to push data")
if not force and status == PipelineStatus.PAUSED:
raise RuntimeError("Pipeline is paused, set force=True to push data")
ensure_dataframe_has_columns(df)
pipeline = self.client.get_pipeline(self.name, PipelineFieldSelector.ALL)
        if table_name.lower() != "now" and table_name.lower() not in [
            tbl.name.lower() for tbl in pipeline.tables
        ]:
            raise ValueError(
                f"Cannot push to table '{table_name}': table with this name"
                f" does not exist in the '{self.name}' pipeline"
            )
        # consider validating the schema here
        for datum in chunk_dataframe(df):
            self.client.push_to_pipeline(
                self.name,
                table_name,
                "json",
                datum.to_json(orient="records", date_format="epoch"),
                json_flavor="pandas",
                array=True,
                serialize=False,
                force=force,
            )
def input_json(
self,
table_name: str,
data: Dict | list,
update_format: str = "raw",
force: bool = False,
wait: bool = True,
):
"""
Push this JSON data to the specified table of the pipeline.
        The pipeline must be in the RUNNING or PAUSED state to push data.
An error will be raised if the pipeline is in any other state.
:param table_name: The name of the table to push data into.
:param data: The JSON encoded data to be pushed to the pipeline. The data should be in the form:
`{'col1': 'val1', 'col2': 'val2'}` or `[{'col1': 'val1', 'col2': 'val2'}, {'col1': 'val1', 'col2': 'val2'}]`
:param update_format: The update format of the JSON data to be pushed to the pipeline. Must be one of:
"raw", "insert_delete". https://docs.feldera.com/formats/json#the-insertdelete-format
:param force: `True` to push data even if the pipeline is paused. `False` by default.
:param wait: If True, blocks until this input has been processed by the pipeline
:raises ValueError: If the update format is invalid.
:raises FelderaAPIError: If the pipeline is not in a valid state to push data.
:raises RuntimeError: If the pipeline is paused and `force` is not set to `True`.
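        Example (a minimal sketch; assumes a table ``users`` exists):

        .. code-block:: python

            pipeline.input_json(
                "users",
                [{"insert": {"id": 1, "name": "alice"}},
                 {"delete": {"id": 2, "name": "bob"}}],
                update_format="insert_delete",
            )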
"""
status = self.status()
if not force and status == PipelineStatus.PAUSED:
raise RuntimeError("Pipeline is paused, set force=True to push data")
if update_format not in ["raw", "insert_delete"]:
raise ValueError("update_format must be one of raw or insert_delete")
array = True if isinstance(data, list) else False
self.client.push_to_pipeline(
self.name,
table_name,
"json",
data,
update_format=update_format,
array=array,
force=force,
wait=wait,
)
def pause_connector(self, table_name: str, connector_name: str):
"""
Pause the specified input connector.
        Connectors allow Feldera to fetch data from a source or write data to a sink.
This method allows users to **PAUSE** a specific **INPUT** connector.
All connectors are RUNNING by default.
Refer to the connector documentation for more information:
https://docs.feldera.com/connectors/#input-connector-orchestration
:param table_name: The name of the table that the connector is attached to.
:param connector_name: The name of the connector to pause.
:raises FelderaAPIError: If the connector is not found, or if the pipeline is not running.
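        Example (a minimal sketch; the table and connector names are
        placeholders):

        .. code-block:: python

            pipeline.pause_connector("my_table", "kafka_in")
            # ... do maintenance work ...
            pipeline.resume_connector("my_table", "kafka_in")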
"""
self.client.pause_connector(self.name, table_name, connector_name)
def resume_connector(self, table_name: str, connector_name: str):
"""
Resume the specified connector.
        Connectors allow Feldera to fetch data from a source or write data to a sink.
This method allows users to **RESUME / START** a specific **INPUT** connector.
All connectors are RUNNING by default.
Refer to the connector documentation for more information:
https://docs.feldera.com/connectors/#input-connector-orchestration
:param table_name: The name of the table that the connector is attached to.
:param connector_name: The name of the connector to resume.
:raises FelderaAPIError: If the connector is not found, or if the pipeline is not running.
"""
self.client.resume_connector(self.name, table_name, connector_name)
def listen(self, view_name: str) -> OutputHandler:
"""
Follow the change stream (i.e., the output) of the provided view.
Returns an output handle to read the changes.
When the pipeline is stopped, the handle is dropped.
The handle will only receive changes from the point in time when the listener is created.
        To receive all changes since the pipeline started, start the pipeline in
        the `PAUSED` state using :meth:`start_paused`, attach listeners, and then
        unpause the pipeline using :meth:`resume`.
:param view_name: The name of the view to listen to.
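        Example (a minimal sketch; assumes a view ``my_view`` and that the
        handler's ``to_pandas`` helper is used to collect the accumulated
        output):

        .. code-block:: python

            pipeline.start_paused()
            handler = pipeline.listen("my_view")
            pipeline.resume()
            pipeline.wait_for_completion()
            df = handler.to_pandas()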
"""
if self.status() not in [PipelineStatus.PAUSED, PipelineStatus.RUNNING]:
raise RuntimeError("Pipeline must be running or paused to listen to output")
handler = OutputHandler(self.client, self.name, view_name)
handler.start()
return handler
def foreach_chunk(
self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]
):
"""
Run the given callback on each chunk of the output of the specified view.
The callback will only receive changes from the point in time when the listener is created.
        To receive all changes since the pipeline started, start the pipeline in
        the `PAUSED` state using :meth:`start_paused`, attach listeners, and then
        unpause the pipeline using :meth:`resume`.
:param view_name: The name of the view.
:param callback: The callback to run on each chunk. The callback should take two arguments:
- **chunk** -> The chunk as a pandas DataFrame
- **seq_no** -> The sequence number. The sequence number is a monotonically increasing integer that
starts from 0. Note that the sequence number is unique for each chunk, but not necessarily contiguous.
        .. note::
            - The callback runs in a separate thread, so it must be thread-safe.
            - The callback should not block for long: backpressure is enabled
              by default, and a slow callback will stall the pipeline.
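        Example (a minimal sketch; assumes a view ``my_view``):

        .. code-block:: python

            import pandas

            def on_chunk(chunk: pandas.DataFrame, seq_no: int) -> None:
                # Runs on a background thread for every output chunk.
                print(f"chunk {seq_no}: {len(chunk)} rows")

            pipeline.foreach_chunk("my_view", on_chunk)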
"""
if self.status() not in [PipelineStatus.RUNNING, PipelineStatus.PAUSED]:
raise RuntimeError("Pipeline must be running or paused to listen to output")
event = Event()
handler = CallbackRunner(
self.client, self.name, view_name, callback, lambda exception: None, event
)
handler.start()
event.wait()
def wait_for_completion(
self, force_stop: bool = False, timeout_s: float | None = None
):
"""
Block until the pipeline has completed processing all input records.
This method blocks until (1) all input connectors attached to the
pipeline have finished reading their input data sources and issued
end-of-input notifications to the pipeline, and (2) all inputs received
from these connectors have been fully processed and corresponding
outputs have been sent out through the output connectors.
This method will block indefinitely if at least one of the input
connectors attached to the pipeline is a streaming connector, such as
Kafka, that does not issue the end-of-input notification.
:param force_stop: If True, the pipeline will be forcibly stopped after
completion. False by default. No checkpoints will be made.
:param timeout_s: Optional. The maximum time (in seconds) to wait for
the pipeline to complete. The default is None, which means wait
indefinitely.
:raises RuntimeError: If the pipeline returns unknown metrics.
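        Example (a minimal sketch; assumes all input connectors are finite,
        e.g. file connectors, so end-of-input is eventually reached):

        .. code-block:: python

            pipeline.start()
            pipeline.wait_for_completion(force_stop=True, timeout_s=600)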
"""
if self.status() not in [
PipelineStatus.RUNNING,
PipelineStatus.INITIALIZING,
PipelineStatus.PROVISIONING,
PipelineStatus.BOOTSTRAPPING,
]:
raise RuntimeError("Pipeline must be running to wait for completion")
start_time = time.monotonic()
while True:
if timeout_s is not None:
elapsed = time.monotonic() - start_time
if elapsed > timeout_s:
raise TimeoutError(
f"timeout ({timeout_s}s) reached while waiting for"
f" pipeline '{self.name}' to complete"
)
logging.debug(
f"waiting for pipeline {self.name} to complete: elapsed"
f" time {elapsed}s, timeout: {timeout_s}s"
)
            # `is_complete` reflects the `pipeline_complete` metric, which may
            # be absent from the stats the pipeline reports.
            pipeline_complete: Optional[bool] = self.is_complete()
            if pipeline_complete is None:
                raise RuntimeError(
                    "received unknown metrics from the pipeline, pipeline_complete is None"
                )
elif pipeline_complete:
break
time.sleep(1)
if force_stop:
self.stop(force=True)
    def is_complete(self) -> Optional[bool]:
        """
        Check if the pipeline has completed processing all input records.
        Returns True if (1) all input connectors attached to the
        pipeline have finished reading their input data sources and issued
        end-of-input notifications to the pipeline, and (2) all inputs received
        from these connectors have been fully processed and corresponding
        outputs have been sent out through the output connectors.
        Returns None if the metric is missing from the pipeline statistics.
"""
return self.stats().global_metrics.pipeline_complete
def wait_for_idle(
self,
idle_interval_s: float = 5.0,
timeout_s: float | None = None,
poll_interval_s: float = 0.2,
):
"""
Wait for the pipeline to become idle and then returns.
Idle is defined as a sufficiently long interval in which the number of
input and processed records reported by the pipeline do not change, and
they equal each other (thus, all input records present at the pipeline
have been processed).
:param idle_interval_s: Idle interval duration (default is 5.0 seconds).
:param timeout_s: Timeout waiting for idle (`None` = no timeout is enforced).
:param poll_interval_s: Polling interval, should be set substantially
smaller than the idle interval (default is 0.2 seconds).
:raises ValueError: If idle interval is larger than timeout, poll interval
is larger than timeout, or poll interval is larger than idle interval.
:raises RuntimeError: If the metrics are missing or the timeout was
reached.
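        Example (a minimal sketch):

        .. code-block:: python

            pipeline.start()
            pipeline.wait_for_idle(idle_interval_s=5.0, timeout_s=300)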
"""
if timeout_s is not None and idle_interval_s > timeout_s:
raise ValueError(
f"idle interval ({idle_interval_s}s) cannot be larger than"
f" timeout ({timeout_s}s)"
)
if timeout_s is not None and poll_interval_s > timeout_s:
raise ValueError(
f"poll interval ({poll_interval_s}s) cannot be larger than"
f" timeout ({timeout_s}s)"
)
if poll_interval_s > idle_interval_s:
raise ValueError(
f"poll interval ({poll_interval_s}s) cannot be larger "
f"than idle interval ({idle_interval_s}s)"
)
start_time_s = time.monotonic()
idle_started_s = None
prev = (0, 0)
while True:
now_s = time.monotonic()
# Metrics retrieval
metrics = self.stats().global_metrics
total_input_records = metrics.total_input_records
total_processed_records = metrics.total_processed_records
if metrics.total_input_records is None:
raise RuntimeError(
"total_input_records is missing from the pipeline metrics"
)
            if metrics.total_processed_records is None:
                raise RuntimeError(
                    "total_processed_records is missing from the pipeline metrics"
                )
# Idle check
unchanged = (
prev[0] == total_input_records and prev[1] == total_processed_records
)
equal = total_input_records == total_processed_records
prev = (total_input_records, total_processed_records)
if unchanged and equal:
if idle_started_s is None:
idle_started_s = now_s
else:
idle_started_s = None
if idle_started_s is not None and now_s - idle_started_s >= idle_interval_s:
return
# Timeout
if timeout_s is not None and now_s - start_time_s >= timeout_s:
raise RuntimeError(f"waiting for idle reached timeout ({timeout_s}s)")
time.sleep(poll_interval_s)
def activate(
self, wait: bool = True, timeout_s: Optional[float] = None
) -> Optional[PipelineStatus]:
"""
Activates the pipeline when starting from STANDBY mode. Only applicable
when the pipeline is starting from a checkpoint in object store.
:param wait: Set True to wait for the pipeline to activate. True by
default
        :param timeout_s: The maximum time (in seconds) to wait for the
            pipeline to activate.
"""
return self.client.activate_pipeline(self.name, wait=wait, timeout_s=timeout_s)
def start(
self,
bootstrap_policy: Optional[BootstrapPolicy] = None,
wait: bool = True,
timeout_s: Optional[float] = None,
dismiss_error: bool = True,
):
"""
.. _start:
Starts this pipeline.
- The pipeline must be in STOPPED state to start.
- If the pipeline is in any other state, an error will be raised.
        - If the pipeline is in PAUSED state, use :meth:`resume` instead.
:param bootstrap_policy: The bootstrap policy to use.
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to start.
:param wait: Set True to wait for the pipeline to start. True by default
:param dismiss_error: Set True to dismiss any deployment error before starting;
set False to make it fail in that case. True by default.
:raises RuntimeError: If the pipeline is not in STOPPED state.
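        Example (a minimal sketch; assumes ``PipelineStatus.STOPPED`` names the
        stopped state):

        .. code-block:: python

            from feldera.enums import PipelineStatus

            if pipeline.status() == PipelineStatus.STOPPED:
                pipeline.start(wait=True, timeout_s=120)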
"""
self.client.start_pipeline(
self.name,
bootstrap_policy=bootstrap_policy,
wait=wait,
timeout_s=timeout_s,
dismiss_error=dismiss_error,
)
def start_paused(
self,
bootstrap_policy: Optional[BootstrapPolicy] = None,
wait: bool = True,
timeout_s: Optional[float] = None,
dismiss_error: bool = True,
):
"""
Starts the pipeline in the paused state.
:param bootstrap_policy: The bootstrap policy to use.
:param wait: Set True to wait for the pipeline to start. True by default.
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to start (defaults to `None` = no timeout is enforced).
:param dismiss_error: Set True to dismiss any deployment error before starting;
set False to make it fail in that case. True by default.
"""
return self.client.start_pipeline_as_paused(
self.name,
bootstrap_policy=bootstrap_policy,
wait=wait,
timeout_s=timeout_s,
dismiss_error=dismiss_error,
)
def start_standby(
self,
bootstrap_policy: Optional[BootstrapPolicy] = None,
wait: bool = True,
timeout_s: Optional[float] = None,
dismiss_error: bool = True,
):
"""
Starts the pipeline in the standby state.
:param bootstrap_policy: The bootstrap policy to use.
:param wait: Set True to wait for the pipeline to start. True by default.
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to start (defaults to `None` = no timeout is enforced).
:param dismiss_error: Set True to dismiss any deployment error before starting;
set False to make it fail in that case. True by default.
"""
self.client.start_pipeline_as_standby(
self.name,
bootstrap_policy=bootstrap_policy,
wait=wait,
timeout_s=timeout_s,
dismiss_error=dismiss_error,
)
def restart(
self,
bootstrap_policy: Optional[BootstrapPolicy] = None,
timeout_s: Optional[float] = None,
dismiss_error: bool = True,
):
"""
Restarts the pipeline.
This method forcibly **STOPS** the pipeline regardless of its current
state and then starts it again. No checkpoints are made when stopping
the pipeline.
:param bootstrap_policy: The bootstrap policy to use.
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to restart.
:param dismiss_error: Set True to dismiss any deployment error before starting;
set False to make it fail in that case. True by default.
"""
self.stop(force=True, timeout_s=timeout_s)
self.start(
bootstrap_policy=bootstrap_policy,
timeout_s=timeout_s,
dismiss_error=dismiss_error,
)
def pause(self, wait: bool = True, timeout_s: Optional[float] = None):
"""
Pause the pipeline.
The pipeline can only transition to the PAUSED state from the RUNNING
state. If the pipeline is already paused, it will remain in the PAUSED
state.
:param wait: Set True to wait for the pipeline to pause. True by default
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to pause.
"""
self.client.pause_pipeline(self.name, wait=wait, timeout_s=timeout_s)
def stop(self, force: bool, wait: bool = True, timeout_s: Optional[float] = None):
"""
Stops the pipeline.
Stops the pipeline regardless of its current state.
:param force: Set True to immediately scale compute resources to zero.
Set False to automatically checkpoint before stopping.
:param wait: Set True to gracefully shutdown listeners and wait for the
pipeline to stop. True by default.
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to stop.
"""
self.client.stop_pipeline(
self.name, force=force, wait=wait, timeout_s=timeout_s
)
def dismiss_error(self):
"""
Dismisses the `deployment_error` of the pipeline.
"""
self.client.dismiss_error_pipeline(self.name)
def approve(self):
"""
Approves the pipeline to proceed with bootstrapping.
This method is used when a pipeline has been started with
        `bootstrap_policy=BootstrapPolicy.AWAIT_APPROVAL` and is currently
        awaiting approval. The pipeline waits for explicit user approval
        before proceeding with the bootstrapping process.
"""
self.client.approve_pipeline(self.name)
def resume(self, wait: bool = True, timeout_s: Optional[float] = None):
"""
Resumes the pipeline from the PAUSED state. If the pipeline is already
running, it will remain in the RUNNING state.
:param wait: Set True to wait for the pipeline to resume. True by default
:param timeout_s: The maximum time (in seconds) to wait for the
pipeline to resume.
"""
self.client.resume_pipeline(self.name, wait=wait, timeout_s=timeout_s)
def start_transaction(self) -> int:
"""
Start a new transaction.
:return: Transaction ID.
:raises FelderaAPIError: If the pipeline fails to start a transaction, e.g., if the pipeline is not running or
there is already an active transaction.
"""
return self.client.start_transaction(self.name)
def commit_transaction(
self,
transaction_id: Optional[int] = None,
wait: bool = True,
timeout_s: Optional[float] = None,
):
"""
Commit the currently active transaction.
:param transaction_id: If provided, the function verifies that the currently active transaction matches this ID.
If the active transaction ID does not match, the function raises an error.
:param wait: If True, the function blocks until the transaction either commits successfully or the timeout is reached.
If False, the function initiates the commit and returns immediately without waiting for completion. The default value is True.
:param timeout_s: Maximum time (in seconds) to wait for the transaction to commit when `wait` is True.
If None, the function will wait indefinitely.
:raises RuntimeError: If there is currently no transaction in progress.
:raises ValueError: If the provided `transaction_id` does not match the current transaction.
:raises TimeoutError: If the transaction does not commit within the specified timeout (when `wait` is True).
:raises FelderaAPIError: If the pipeline fails to commit a transaction.
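        Example (a minimal sketch; assumes a running pipeline with a table
        ``users``):

        .. code-block:: python

            txn = pipeline.start_transaction()
            pipeline.input_json("users", {"id": 1, "name": "alice"})
            pipeline.commit_transaction(txn, wait=True, timeout_s=60)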
"""
self.client.commit_transaction(self.name, transaction_id, wait, timeout_s)
def transaction_status(self) -> TransactionStatus:
"""
Get pipeline's transaction handling status.
:return: Current transaction handling status of the pipeline.
:raises FelderaAPIError: If pipeline's status couldn't be read, e.g., because the pipeline is not currently running.
"""
return self.stats().global_metrics.transaction_status
def transaction_id(self) -> Optional[int]:
"""
Gets the ID of the currently active transaction or None if there is no active transaction.
:return: The ID of the transaction.
"""
transaction_id = self.stats().global_metrics.transaction_id
if transaction_id == 0:
return None
else:
return transaction_id
def delete(self, clear_storage: bool = False):
"""
Deletes the pipeline.
        The pipeline must be stopped and its storage cleared before it can be
        deleted.
:param clear_storage: True if the storage should be cleared before
deletion. False by default
:raises FelderaAPIError: If the pipeline is not in STOPPED state or the
storage is still bound.
"""
if clear_storage:
self.clear_storage()
self.client.delete_pipeline(self.name)
@staticmethod
def get(name: str, client: FelderaClient) -> "Pipeline":
"""
Get the pipeline if it exists.
:param name: The name of the pipeline.
:param client: The FelderaClient instance.
"""
try:
inner = client.get_pipeline(name, PipelineFieldSelector.ALL)
return Pipeline._from_inner(inner, client)
except FelderaAPIError as err:
if err.status_code == 404:
err.message = f"Pipeline with name {name} not found"
raise err
@staticmethod
def all(client: FelderaClient) -> List["Pipeline"]:
"""
Get all pipelines.
:param client: The FelderaClient instance.
:return: A list of Pipeline objects.
"""
return [Pipeline._from_inner(p, client) for p in client.pipelines()]
def checkpoint(self, wait: bool = False, timeout_s: Optional[float] = None) -> int:
"""
Checkpoints this pipeline.
:param wait: If true, will block until the checkpoint completes.
:param timeout_s: The maximum time (in seconds) to wait for the
checkpoint to complete (defaults to `None` = no timeout is enforced).
:return: The checkpoint sequence number.
:raises FelderaAPIError: If enterprise features are not enabled.
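        Example (a minimal sketch; requires enterprise features):

        .. code-block:: python

            from feldera.enums import CheckpointStatus

            seq = pipeline.checkpoint(wait=True, timeout_s=60)
            assert pipeline.checkpoint_status(seq) == CheckpointStatus.Success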
"""
seq = self.client.checkpoint_pipeline(self.name)
if not wait:
return seq
        start = time.monotonic()
        while True:
            elapsed = time.monotonic() - start
            if timeout_s is not None and elapsed > timeout_s:
                raise TimeoutError(
                    f"timeout ({timeout_s}s) reached while waiting for"
                    f" pipeline '{self.name}' to make checkpoint '{seq}'"
                )
status = self.checkpoint_status(seq)
if status == CheckpointStatus.InProgress:
time.sleep(0.1)
continue
return seq
def checkpoint_status(self, seq: int) -> CheckpointStatus:
"""
Checks the status of the given checkpoint.
:param seq: The checkpoint sequence number.
"""
resp = self.client.checkpoint_pipeline_status(self.name)
success = resp.get("success")
if seq == success:
return CheckpointStatus.Success
fail = resp.get("failure") or {}
if seq == fail.get("sequence_number"):
failure = CheckpointStatus.Failure
failure.error = fail.get("error", "")
return failure
if (success is None) or seq > success:
return CheckpointStatus.InProgress
        # At this point seq < success is the only remaining case.
        return CheckpointStatus.Unknown
def sync_checkpoint(
self, wait: bool = False, timeout_s: Optional[float] = None
) -> str:
"""
Syncs this checkpoint to object store.
:param wait: If true, will block until the checkpoint sync operation
completes.
:param timeout_s: The maximum time (in seconds) to wait for the
checkpoint to complete syncing.
:raises FelderaAPIError: If no checkpoints have been made.
:raises RuntimeError: If syncing the checkpoint fails.
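        Example (a minimal sketch; assumes a checkpoint exists and object-store
        sync is configured):

        .. code-block:: python

            pipeline.checkpoint(wait=True)
            uuid = pipeline.sync_checkpoint(wait=True, timeout_s=120)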
"""
uuid = self.client.sync_checkpoint(self.name)
if not wait:
return uuid
        start = time.monotonic()
        while True:
            elapsed = time.monotonic() - start
            if timeout_s is not None and elapsed > timeout_s:
                raise TimeoutError(
                    f"timeout ({timeout_s}s) reached while waiting for"
                    f" pipeline '{self.name}' to sync checkpoint '{uuid}'"
                )
status = self.sync_checkpoint_status(uuid)
if status == CheckpointStatus.Failure:
                raise RuntimeError(
                    f"failed to sync checkpoint '{uuid}': {status.get_error()}"
                )
if status in [CheckpointStatus.InProgress, CheckpointStatus.Unknown]:
time.sleep(0.1)
continue
break
return uuid
def sync_checkpoint_status(self, uuid: str) -> CheckpointStatus:
"""
Checks the status of the given checkpoint sync operation.
        If the checkpoint is currently being synchronized, returns
        `CheckpointStatus.InProgress`.
Failures are not raised as runtime errors and must be explicitly
checked.
:param uuid: The checkpoint uuid.
"""
resp = self.client.sync_checkpoint_status(self.name)
success = resp.get("success")
periodic = resp.get("periodic")
fail = resp.get("failure") or {}
if uuid == success or uuid == periodic:
return CheckpointStatus.Success
if uuid == fail.get("uuid"):
failure = CheckpointStatus.Failure
failure.error = fail.get("error", "")
logging.error(f"failed to sync checkpoint '{uuid}': {failure.error}")
return failure
if (success is None) or UUID(uuid) > UUID(success):
return CheckpointStatus.InProgress
return CheckpointStatus.Unknown
def last_successful_checkpoint_sync(self) -> UUID:
"""
Returns the UUID of the last successfully synced checkpoint.
:return: The UUID of the last successfully synced checkpoint.
"""
resp = self.client.sync_checkpoint_status(self.name)
success = resp.get("success")
periodic = resp.get("periodic")
if success is None and periodic is None:
raise RuntimeError("no checkpoints have been synced yet")
elif success is None:
return UUID(periodic)
elif periodic is None:
return UUID(success)
else:
return max(UUID(success), UUID(periodic))
def query(self, query: str) -> Generator[Mapping[str, Any], None, None]:
"""
Executes an ad-hoc SQL query on this pipeline and returns a generator
that yields the rows of the result as Python dictionaries. For
``INSERT`` and ``DELETE`` queries, consider using :meth:`.execute`
instead. All floating-point numbers are deserialized as Decimal objects
to avoid precision loss.
Note:
You can only ``SELECT`` from materialized tables and views.
Important:
This method is lazy. It returns a generator and is not evaluated
until you consume the result.
:param query: The SQL query to be executed.
:return: A generator that yields the rows of the result as Python
dictionaries.
:raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
state.
        :raises FelderaAPIError: If querying a non-materialized table or view.
:raises FelderaAPIError: If the query is invalid.
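        Example (a minimal sketch; assumes a materialized view ``my_view``):

        .. code-block:: python

            for row in pipeline.query("SELECT * FROM my_view LIMIT 10"):
                print(row)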
"""
return self.client.query_as_json(self.name, query)
def query_parquet(self, query: str, path: str):
"""
Executes an ad-hoc SQL query on this pipeline and saves the result to
the specified path as a parquet file. If the extension isn't `parquet`,
it will be automatically appended to `path`.
Note:
You can only ``SELECT`` from materialized tables and views.
:param query: The SQL query to be executed.
:param path: The path of the parquet file.
:raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
state.
        :raises FelderaAPIError: If querying a non-materialized table or view.
:raises FelderaAPIError: If the query is invalid.
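        Example (a minimal sketch; assumes a materialized view ``my_view``):

        .. code-block:: python

            pipeline.query_parquet("SELECT * FROM my_view", "my_view.parquet")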
"""
self.client.query_as_parquet(self.name, query, path)
def query_tabular(self, query: str) -> Generator[str, None, None]:
"""
Executes a SQL query on this pipeline and returns the result as a
formatted string.
Note:
You can only ``SELECT`` from materialized tables and views.
Important:
This method is lazy. It returns a generator and is not evaluated
until you consume the result.
:param query: The SQL query to be executed.
:return: A generator that yields a string representing the query result
in a human-readable, tabular format.
:raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
state.
        :raises FelderaAPIError: If querying a non-materialized table or view.
:raises FelderaAPIError: If the query is invalid.
"""
return self.client.query_as_text(self.name, query)
def query_hash(self, query: str):
"""
Executes an ad-hoc SQL query on this pipeline and returns the result
as a hash of the result set. This is useful for quickly checking
if the result set has changed without retrieving the entire result.
Note:
            For a stable hash, the query must be deterministic, i.e., it should
            impose a total order on its results (e.g., via ``ORDER BY``).
:param query: The SQL query to be executed.
:raises FelderaAPIError: If the pipeline is not in a RUNNING or PAUSED
state.
        :raises FelderaAPIError: If querying a non-materialized table or view.
:raises FelderaAPIError: If the query is invalid.
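        Example (a minimal sketch; assumes a materialized view ``my_view`` with
        an ``id`` column):

        .. code-block:: python

            before = pipeline.query_hash("SELECT * FROM my_view ORDER BY id")
            # ... ingest more data ...
            after = pipeline.query_hash("SELECT * FROM my_view ORDER BY id")
            changed = before != after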
"""
return self.client.query_as_hash(self.name, query)