--Step 1 Create the data source
--Need PolyBase installed
CREATE EXTERNAL DATA SOURCE NYCTaxiExternalDataSource
WITH (LOCATION = 'abs://nyctlc@azureopendatastorage.blob.core.windows.net');
GO
SELECT TOP 100
vendorID, tpepPickupDateTime, passengerCount
FROM
OPENROWSET(
BULK 'yellow/*/*/*.parquet',
DATA_SOURCE = 'NYCTaxiExternalDataSource',
FORMAT='parquet'
) AS nyc;
--step 2
SELECT TOP 100
vendorID, tpepPickupDateTime, passengerCount
FROM
OPENROWSET(
BULK 'yellow/*/*/*.parquet',
DATA_SOURCE = 'NYCTaxiExternalDataSource',
FORMAT='PARQUET'
)
WITH (
vendorID varchar(4), -- we're using length of 4 instead of the inferred 8000
tpepPickupDateTime datetime2,
passengerCount int
) AS nyc;
--step 4
--Query all files and project file path and file name information for each row:
SELECT TOP 10 filerows.filepath(1) as [Year_Folder], filerows.filepath(2) as [Month_Folder],
filerows.filename() as [File_name], filerows.filepath() as [Full_Path], *
FROM OPENROWSET(
BULK 'yellow/puYear=*/puMonth=*/*.parquet',
DATA_SOURCE = 'NYCTaxiExternalDataSource',
FORMAT = 'parquet') AS filerows;
--List all paths:
SELECT DISTINCT filerows.filepath(1) as [Year_Folder], filerows.filepath(2) as [Month_Folder]
FROM OPENROWSET(
BULK 'yellow/puYear=*/puMonth=*/*.parquet',
DATA_SOURCE = 'NYCTaxiExternalDataSource',
FORMAT = 'parquet') AS filerows;
--with thanks and acknowledgement to https://learn.microsoft.com/en-us/sql/relational-databases/polybase/polybase-get-started?view=sql-server-ver17&preserve-view=true
--Housekeeping
DROP EXTERNAL DATA SOURCE NYCTaxiExternalDataSource;
uninstall PolyBase if not needed