Saturday, May 23, 2026

SQL 2025 master class LAB 07 PolyBase



--Step 1 Create the data source
--Need PolyBase installed
CREATE EXTERNAL DATA SOURCE NYCTaxiExternalDataSource 
WITH (LOCATION = 'abs://nyctlc@azureopendatastorage.blob.core.windows.net');
GO

 SELECT TOP 100
 vendorID, tpepPickupDateTime, passengerCount 
 FROM 
 OPENROWSET( 
  BULK 'yellow/*/*/*.parquet', 
  DATA_SOURCE = 'NYCTaxiExternalDataSource', 
  FORMAT='parquet' 
 ) AS nyc; 

 --step 2

 SELECT TOP 100
 vendorID, tpepPickupDateTime, passengerCount 
FROM 
 OPENROWSET( 
  BULK 'yellow/*/*/*.parquet', 
  DATA_SOURCE = 'NYCTaxiExternalDataSource', 
  FORMAT='PARQUET' 
 ) 
WITH ( 
 vendorID varchar(4), -- we're using length of 4 instead of the inferred 8000 
 tpepPickupDateTime datetime2, 
 passengerCount int 
 ) AS nyc;

--step 4

--Query all files and project file path and file name information for each row: 

SELECT TOP 10 filerows.filepath(1) as [Year_Folder], filerows.filepath(2) as [Month_Folder], 
filerows.filename() as [File_name], filerows.filepath() as [Full_Path], * 
FROM OPENROWSET( 
 BULK 'yellow/puYear=*/puMonth=*/*.parquet', 
 DATA_SOURCE = 'NYCTaxiExternalDataSource', 
 FORMAT = 'parquet') AS filerows; 

--List all paths: 
SELECT DISTINCT filerows.filepath(1) as [Year_Folder], filerows.filepath(2) as [Month_Folder] 
FROM OPENROWSET( 
 BULK 'yellow/puYear=*/puMonth=*/*.parquet', 
 DATA_SOURCE = 'NYCTaxiExternalDataSource', 
 FORMAT = 'parquet') AS filerows; 

 --with thanks and acknowledgement to https://learn.microsoft.com/en-us/sql/relational-databases/polybase/polybase-get-started?view=sql-server-ver17&preserve-view=true 

 --Housekeeping

DROP EXTERNAL DATA SOURCE NYCTaxiExternalDataSource; 

uninstall PolyBase if not needed