Serverless data migration from TiDB Cloud to REDSHIFT

Serverless data migration
  • S3 bucket preparation
  1. In the lambda source code the dumpling binary is downloaded to export data from TiDB Cloud. Keeping the dumpling binary in the S3 is to limit the source code size.
  2. To deploy the python lambda and layer, push them to S3 bucket firstly, from that it is deployed to lambda.
  3. Lastly, the data is exported to S3 from TiDB Cloud without disk planning and from S3 the data is imported into REDSHIFT.
$ aws s3api create-bucket --bucket tidbcloudredshift --create-bucket-configuration LocationConstraint=ap-northeast-1 --region ap-northeast-1
{
"Location": "http://tidbcloudredshift.s3.amazonaws.com/"
}

$ aws s3api put-object --bucket tidbcloudredshift --key data/
{
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\""
}

$ aws s3api put-object --bucket tidbcloudredshift --key source_code/
{
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\""
}
$ aws s3api put-object --bucket tidbcloudredshift --key lib/
{
"ETag": "\"d41d8cd98f00b204e9800998ecf8427e\""
}
  • Download and push TiDB tools(Dumpling binary) to S3
$ wget https://download.pingcap.org/tidb-toolkit-v5.4.0-linux-amd64.tar.gz
$ tar xvf tidb-toolkit-v5.4.0-linux-amd64.tar.gz
tidb-toolkit-v5.4.0-linux-amd64/
tidb-toolkit-v5.4.0-linux-amd64/bin/
tidb-toolkit-v5.4.0-linux-amd64/bin/br
tidb-toolkit-v5.4.0-linux-amd64/bin/tidb-lightning
tidb-toolkit-v5.4.0-linux-amd64/bin/mydumper
tidb-toolkit-v5.4.0-linux-amd64/bin/sync_diff_inspector
tidb-toolkit-v5.4.0-linux-amd64/bin/pd-tso-bench
tidb-toolkit-v5.4.0-linux-amd64/bin/tidb-lightning-ctl
tidb-toolkit-v5.4.0-linux-amd64/bin/dumpling
$ aws s3 sync ./tidb-toolkit-v5.4.0-linux-amd64 s3://tidbcloudredshift/lib/
  • Create policy for S3 access
$ more policy4redshift.json 
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:GetObjectVersion"
],
"Resource": "arn:aws:s3:::tidbcloudredshift/*"
},
{
"Effect": "Allow",
"Action": [
"s3:ListBucket",
"s3:GetBucketLocation"
],
"Resource": "arn:aws:s3:::tidbcloudredshift"
}
]
}
$ aws iam create-policy --policy-name policy4tidbcloudredshift --policy-document file://policy4redshift.json
{
"Policy": {
"PolicyName": "policy4tidbcloudredshift",
"PolicyId": "ANPAVTR2JPDXDMAII3MFL",
"Arn": "arn:aws:iam::3855dddddddd:policy/policy4tidbcloudredshift",
"Path": "/",
"DefaultVersionId": "v1",
"AttachmentCount": 0,
"PermissionsBoundaryUsageCount": 0,
"IsAttachable": true,
"CreateDate": "2022-03-29T08:32:56Z",
"UpdateDate": "2022-03-29T08:32:56Z"
}
}
  1. Create role for lambda
$ aws iam create-role --role-name role4lambda --assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "lambda.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
{
"Role": {
"Path": "/",
"RoleName": "role4lambda",
"RoleId": "AROAVTR2JPDXFKAUWLSOG",
"Arn": "arn:aws:iam::3855dddddddd:role/role4lambda",
"CreateDate": "2022-03-29T08:45:18Z",
"AssumeRolePolicyDocument": {
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "lambda.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
}
}

$ aws iam attach-role-policy --role-name role4lambda --policy-arn arn:aws:iam::3855dddddddd:policy/policy4tidbcloudredshift
$ aws iam attach-role-policy --role-name role4lambda --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole
$ aws iam create-role --role-name role4redshift --assume-role-policy-document '{"Version": "2012-10-17","Statement": [{ "Effect": "Allow", "Principal": {"Service": "redshift.amazonaws.com"}, "Action": "sts:AssumeRole"}]}'
{
"Role": {
"Path": "/",
"RoleName": "role4redshift",
"RoleId": "AROAVTR2JPDXJ2OJNYUCW",
"Arn": "arn:aws:iam::3855dddddddd:role/role4redshift",
"CreateDate": "2022-03-29T08:50:51Z",
"AssumeRolePolicyDocument": {
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "redshift.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
}
}
$ aws iam attach-role-policy --role-name role4redshift --policy-arn arn:aws:iam::3855dddddddd:policy/policy4tidbcloudredshift
  • Create endpoints on both lambda’s VPC and redshift’s VPC
  • Lambda layer preparation
$ mkdir python
$ pip3 install sqlalchemy -t $(pwd)/python/
$ pip3 install pymysql -t $(pwd)/python/
$ git clone https://github.com/jkehler/awslambda-psycopg2.git
$ cp -r awslambda-psycopg2/psycopg2-3.7 python/psycopg2
$ zip -r lambda-layer.zip ./python
$ aws s3 cp ./lambda-layer.zip s3://tidbcloudredshift/source_code/
upload: ./lambda-layer.zip to s3://tidbcloudredshift/source_code/lambda-layer.zip
$ aws lambda publish-layer-version --layer-name tidbcloud2redshift --description "Demo for data copy" \
--license-info "MIT" --content S3Bucket=tidbcloudredshift,S3Key=source_code/lambda-layer.zip \
--compatible-runtimes python3.7
{
"Content": {
"Location": "https://awslambda-ap-ne-1-layers.s3.ap-northeast-1.amazonaws.com/snapshots/3855......../tidbcloud2redshift-958f2e8d-53d0-4722-9737-e4a9edd6a674?versionId=... ...Token=... ... Signature=... ...",
"CodeSha256": "2uospbQd7xxeINucUnw0gxUVpA7thXGhNcARwM4+w0E=",
"CodeSize": 4550209
},
"LayerArn": "arn:aws:lambda:ap-northeast-1:3855dddddddd:layer:tidbcloud2redshift",
"LayerVersionArn": "arn:aws:lambda:ap-northeast-1:3855dddddddd:layer:tidbcloud2redshift:1",
"Description": "Demo for data copy",
"CreatedDate": "2022-03-29T09:29:28.787+0000",
"Version": 1,
"CompatibleRuntimes": [
"python3.7"
],
"LicenseInfo": "MIT"
}
  • Redshift setup
  • Lambda function
admin@ip-172-81-11-88:~$ cd lambda/
admin@ip-172-81-11-88:~/lambda$ zip -q -r lambda.zip *
admin@ip-172-81-11-88:~/lambda$ ls
lambda.zip main.py
admin@ip-172-81-11-88:~/lambda$ aws lambda create-function \
--function-name tidb2redshift \
--runtime python3.7 \
--zip-file fileb://lambda.zip \
--handler main.lambda_handler \
--vpc-config SubnetIds=subnet-065c16a37ad39cda4,SecurityGroupIds=sg-02a5af8d81b32cd8e \
--layers "arn:aws:lambda:ap-northeast-1:3855dddddddd:layer:tidbcloud2redshift:1" \
--environment Variables="{BUCKET_NAME=tidbcloudredshift,RD_HOST='tidbcloud2redshift.c0etc6q3cp6v.ap-northeast-1.redshift.amazonaws.com',RD_NAME=dev,RD_PASS=1234Abcd,RD_PORT=5439,RD_USER=awsuser,REDSHIFT_ROLE='arn:aws:iam::3855dddddddd:role/role4redshift',S3_DATA_FOLDER=data,S3_LIB_FOLDER='lib/bin',S3_REGION='ap-northeast-1',TiDB_HOST=172.30.91.140,TiDB_PASS=1234Abcd,TiDB_PORT=4000,TiDB_USER=root}" \
--role arn:aws:iam::3855dddddddd:role/role4lambda

{ [6/4619]
"FunctionName": "tidb2redshift",
"FunctionArn": "arn:aws:lambda:ap-northeast-1:385595570414:function:tidb2redshift",
"Runtime": "python3.7",
"Role": "arn:aws:iam::3855dddddddd:role/role4lambda",
"Handler": "main.lambda_handler",
"CodeSize": 1372,
"Description": "",
"Timeout": 3,
"MemorySize": 128,
"LastModified": "2022-03-31T04:00:46.197+0000",
"CodeSha256": "GRHC8bm8z7f7jlwG1kUFk7TMzYZWDqymySmpsJn3xNA=",
"Version": "$LATEST",
"VpcConfig": {
"SubnetIds": [
"subnet-065c16a37ad39cda4"
],
"SecurityGroupIds": [
"sg-02a5af8d81b32cd8e"
],
"VpcId": "vpc-08b59bfa6398897a2"
},
"Environment": {
"Variables": {
"TiDB_HOST": "172.30.91.140",
"S3_REGION": "ap-northeast-1",
"RD_PORT": "5439",
"TiDB_PASS": "1234Abcd",
"TiDB_USER": "root",
"REDSHIFT_ROLE": "arn:aws:iam::385595570414:role/role4redshift",
"RD_HOST": "tidbcloud2redshift.c0etc6q3cp6v.ap-northeast-1.redshift.amazonaws.com",
"S3_DATA_FOLDER": "data",
"RD_USER": "awsuser",
"RD_PASS": "1234Abcd",
"TiDB_PORT": "4000",
"S3_LIB_FOLDER": "lib/bin",
"BUCKET_NAME": "tidbcloudredshift",
"RD_NAME": "dev"
}
},
"TracingConfig": {
"Mode": "PassThrough"
},
"RevisionId": "6896e2d3-7aad-4964-88b3-a42a2ff25f68",
"Layers": [
{
"Arn": "arn:aws:lambda:ap-northeast-1:385595570414:layer:tidbcloud2redshift:1",
"CodeSize": 4550209
}
]
}
  • Extend the timeout of the lambda to 1 minute
  • event setup to call data sync

Conclusion

  • No EC2 server
  • No disk planning
  • No scheduler

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store