Stephan Köppen

One of the first things in any PDW Project, is the migration of the existing data into the PDW. This could be necessary for POCs, Development Issues or the final move towards the new production system.

Since the PDW is optimized for Star-Schema, we will most likely have non partitioned dimension tables and partitioned fact tables.

 

Exporting Data

To split the existing data, we will export all non partitioned tables into one CSV File and the partitioned tables are split for each filled partition.

The tool of choise for the SMP enviroment is the bcp (bulk copy program) tool. Bcp is a commandline tool delivered with the SQL-Server and is usually installed on the server itself.

To create all the necessary statements, we will select the database we want to export, change the parameters, and execute the script.

NOTE: In our example we imply that the partition columns are based on int values.


-- START Parameters
DECLARE @path VARCHAR(100) = ‚C:\Export\‘
DECLARE @delimiter VARCHAR(100) = ‚|‘
-- END Parameters
DECLARE @table NVARCHAR(100)
DECLARE @partitionvalue SQL_VARIANT
DECLARE @partitioncolumn SQL_VARIANT
DECLARE @string NVARCHAR(max)
DECLARE @database VARCHAR(50) = Db_name(Db_id())
DECLARE @server VARCHAR(20) = @@SERVERNAME
DECLARE cur_tabelle CURSOR fast_forward FOR
  SELECT o.name                                    AS Tabelle,
         Cast(Isnull(prv_left.value, 0) AS INT)    AS PartitionValue,
         Cast(Isnull(c.name, ) AS NVARCHAR(100)) AS PartitionColumn
  FROM   sys.objects AS o
         LEFT JOIN sys.partitions AS p
                ON o.object_id = p.object_id
         LEFT JOIN sys.indexes AS i
                ON i.object_id = p.object_id
                   AND i.index_id = p.index_id
         LEFT JOIN sys.data_spaces AS ds
                ON ds.data_space_id = i.data_space_id
         LEFT JOIN sys.partition_schemes AS ps
                ON ps.data_space_id = ds.data_space_id
         LEFT JOIN sys.partition_functions AS pf
                ON pf.function_id = ps.function_id
         LEFT JOIN sys.destination_data_spaces AS dds2
                ON dds2.partition_scheme_id = ps.data_space_id
                   AND dds2.destination_id = p.partition_number
         LEFT JOIN sys.filegroups AS fg
                ON fg.data_space_id = dds2.data_space_id
         LEFT OUTER JOIN sys.partition_range_values AS prv_left
                      ON ps.function_id = prv_left.function_id
                         AND prv_left.boundary_id = p.partition_number  1
         LEFT OUTER JOIN sys.index_columns ic
                      ON --ic.partition_ordinal
         ic.index_id = i.index_id
         AND ic.object_id = o.object_id
         LEFT JOIN sys.columns c
                ON c.object_id = ic.object_id
                   AND c.column_id = ic.column_id
  WHERE  o.type = ‚U‘
         AND p.rows != 0

OPEN cur_tabelle

FETCH next FROM cur_tabelle INTO @table, @PartitionValue, @PartitionColumn

WHILE @@fetch_status = 0
  BEGIN
      IF @PartitionValue = 0
        BEGIN
            SET @string = ‚bcp "select * from ‚ + @table
                          + ‚" queryout "‘ + @path + @table + ‚.csv" -S ‚
                          + @server + ‚ -T -C RAW -d ‚ + @database + ‚ -t"‘
                          + @delimiter + ‚" -c‘
        END
      ELSE
        BEGIN
            SET @string = ‚bcp "select * from ‚ + @table + ‚ where ‚
                          + Cast(@PartitionColumn AS NVARCHAR(100))
                          + ‚ = ‚
                          + Cast (@PartitionValue AS NVARCHAR(10))
                          + ‚" queryout "‘ + @path + @table + ‚_‘
                          + Cast (@PartitionValue AS NVARCHAR(100))
                          + ‚.csv" -S ‚ + @server + ‚ -T -C RAW -d ‚
                          + @database + ‚ -t"‘ + @delimiter + ‚" -c‘
        END

      PRINT @string

      FETCH next FROM cur_tabelle INTO @table, @PartitionValue, @PartitionColumn
  END

CLOSE cur_tabelle

DEALLOCATE cur_tabelle 

The now created command line statements can easily be put into a regular batch file and be executed. It is also possible to split the result into multiple files to export in parallel.

 

Importing Data 

As soon as all the necessary files are copied from the source to the Landingzone of the PDW, we can easily import the Data via DWHLoader

Since the PDW doesn’t support AD Logins, it is necessary to pass over some login information as well.

The below shown script will also be executed on the source system in order to create the necessary import scripts.

NOTE: In our example we imply that the database name stays the same and that the data model is an exact copy of the exported database.


-- START Parameters
DECLARE @path VARCHAR(100) = ‚C:\Import\‘
DECLARE @delimiter VARCHAR(100) = ‚|‘
DECLARE @login VARCHAR(100) = ‚username‘
DECLARE @password VARCHAR(100) = ‚password‘
-- END Parameters
DECLARE @table NVARCHAR(100)
DECLARE @partitionvalue SQL_VARIANT
DECLARE @partitioncolumn SQL_VARIANT
DECLARE @string NVARCHAR(max)
DECLARE @database VARCHAR(50) = Db_name(Db_id())
DECLARE @server VARCHAR(20) = @@SERVERNAME
DECLARE cur_tabelle CURSOR fast_forward FOR
  SELECT o.name                                    AS Tabelle,
         Cast(Isnull(prv_left.value, 0) AS INT)    AS PartitionValue,
         Cast(Isnull(c.name, ) AS NVARCHAR(100)) AS PartitionColumn
  FROM   sys.objects AS o
         LEFT JOIN sys.partitions AS p
                ON o.object_id = p.object_id
         LEFT JOIN sys.indexes AS i
                ON i.object_id = p.object_id
                   AND i.index_id = p.index_id
         LEFT JOIN sys.data_spaces AS ds
                ON ds.data_space_id = i.data_space_id
         LEFT JOIN sys.partition_schemes AS ps
                ON ps.data_space_id = ds.data_space_id
         LEFT JOIN sys.partition_functions AS pf
                ON pf.function_id = ps.function_id
         LEFT JOIN sys.destination_data_spaces AS dds2
                ON dds2.partition_scheme_id = ps.data_space_id
                   AND dds2.destination_id = p.partition_number
         LEFT JOIN sys.filegroups AS fg
                ON fg.data_space_id = dds2.data_space_id
         LEFT OUTER JOIN sys.partition_range_values AS prv_left
                      ON ps.function_id = prv_left.function_id
                         AND prv_left.boundary_id = p.partition_number  1
         LEFT OUTER JOIN sys.index_columns ic
                      ON --ic.partition_ordinal
         ic.index_id = i.index_id
         AND ic.object_id = o.object_id
         LEFT JOIN sys.columns c
                ON c.object_id = ic.object_id
                   AND c.column_id = ic.column_id
  WHERE  o.type = ‚U‘
         AND p.rows != 0

OPEN cur_tabelle

FETCH next FROM cur_tabelle INTO @table, @PartitionValue, @PartitionColumn

WHILE @@fetch_status = 0
  BEGIN
      IF @PartitionValue = 0
        BEGIN
            SET @string =
‚"C:\Program Files\Microsoft SQL Server Parallel Data Warehouse\100\dwloader.exe" -U ‚
+ @login + ‚ -P ‚ + @password + ‚ -T ‚ + @database
+ ‚.dbo.‘ + @table + ‚ -i "‘ + @path + @table
+ ‚.csv" -R "‘ + @path + @table
+ ‚.reject.txt" -E -M append -fh 0 -e ASCII -rt value -rv 0 -t "‘
+ @delimiter + ‚" -r 0x0d0x0a‘
END
ELSE
  BEGIN
      SET @string =
‚"C:\Program Files\Microsoft SQL Server Parallel Data Warehouse\100\dwloader.exe" -U ‚
+ @login + ‚ -P ‚ + @password + ‚ -T ‚ + @database
+ ‚.dbo.‘ + @table + ‚ -i "‘ + @path + @table + ‚_‘
+ Cast (@PartitionValue AS NVARCHAR(100))
+ ‚.csv" -R "‘ + @path + @table
+ ‚.reject.txt" -E -M fastappend -fh 0 -e ASCII -rt value -rv 0 -t "‘
+ @delimiter + ‚" -r 0x0d0x0a‘
END

    PRINT @string

    FETCH next FROM cur_tabelle INTO @table, @PartitionValue, @PartitionColumn
END

CLOSE cur_tabelle

DEALLOCATE cur_tabelle 

As already described after the export, the now created statements can be put into a batch file and let the DWLoader to the rest.