This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris-website.git
The following commit(s) were added to refs/heads/master by this push: new cc6bf57d177 [blog] Update TikTok Blog (#1106) cc6bf57d177 is described below commit cc6bf57d1772b3be328964597553fc89784a53a2 Author: KassieZ <139741991+kass...@users.noreply.github.com> AuthorDate: Thu Sep 12 21:56:26 2024 +0800 [blog] Update TikTok Blog (#1106) # Versions - [ ] dev - [ ] 3.0 - [ ] 2.1 - [ ] 2.0 # Languages - [ ] Chinese - [x] English --- ...d-built-a-pb-scale-logging-system-with-doris.md | 2 +- blog/auto-partition-in-apache-doris.md | 2 +- ...time-data-warehouse-in-TikTok-based-on-doris.md | 104 +++++++++++++++++++++ src/constant/newsletter.data.ts | 15 ++- static/images/OLAP-engine.PNG | Bin 0 -> 570413 bytes .../real-time-data-warehouse-in-TikTok-2.PNG | Bin 0 -> 492943 bytes .../images/real-time-data-warehouse-in-TikTok.PNG | Bin 0 -> 427171 bytes ...a-warehouse-in-tiktok-based-on-apache-doris.png | Bin 0 -> 1102809 bytes static/images/real-time-ranking-board.png | Bin 0 -> 303881 bytes 9 files changed, 113 insertions(+), 10 deletions(-) diff --git a/blog/ai-unicorn-minimax-from-loki-and-built-a-pb-scale-logging-system-with-doris.md b/blog/ai-unicorn-minimax-from-loki-and-built-a-pb-scale-logging-system-with-doris.md index 3a72eb871ea..2f894b22bcc 100644 --- a/blog/ai-unicorn-minimax-from-loki-and-built-a-pb-scale-logging-system-with-doris.md +++ b/blog/ai-unicorn-minimax-from-loki-and-built-a-pb-scale-logging-system-with-doris.md @@ -7,7 +7,7 @@ 'author': 'Apache Doris', 'tags': ['Best Practice'], 'picked': "true", - 'order': "2", + 'order': "3", "image": '/images/minimax-migrated-from-loki-to-doris.png' } diff --git a/blog/auto-partition-in-apache-doris.md b/blog/auto-partition-in-apache-doris.md index 8ebd505ec35..5a774278ff3 100644 --- a/blog/auto-partition-in-apache-doris.md +++ b/blog/auto-partition-in-apache-doris.md @@ -7,7 +7,7 @@ 'author': 'Apache Doris', 'tags': ['Tech Sharing'], 'picked': "true", - 'order': "3", + 'order': "4", "image": '/images/auto-partition-in-apache-doris.jpg' } diff --git a/blog/real-time-data-warehouse-in-TikTok-based-on-doris.md b/blog/real-time-data-warehouse-in-TikTok-based-on-doris.md new file mode 100644 index 00000000000..c36c0915d92 --- /dev/null +++ b/blog/real-time-data-warehouse-in-TikTok-based-on-doris.md @@ -0,0 +1,104 @@ +--- +{ + 'title': "Real-time data warehouse in TikTok based on Apache Doris", + 'description': "We are excited to invite the data platform team of TikTok to talk about how they use Apache Doris in their real-time data architecture and how they benefit from it, which could serve as a model for effective real-time data warehousing.", + 'summary': "We are excited to invite the data platform team of TikTok to talk about how they use Apache Doris in their real-time data architecture and how they benefit from it, which could serve as a model for effective real-time data warehousing.", + 'date': '2024-09-12', + 'author': 'Data Platform Team of TikTok', + 'tags': ['Best Practice'], + 'picked': "true", + 'order': "2", + "image": '/images/real-time-data-warehouse-in-tiktok-based-on-apache-doris.png' +} + +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + + +Live streaming and e-commerce are among the biggest revenue drivers of TikTok, and they both rely on real-time data processing. It is more challenging than offline batch processing because it involves complicated operations like multi-stream JOINs and dimension table changes. It requires a higher level of development and maintenance input, and due to the need for system stability guarantee, it often leads to resource redundancy and waste. + +We are excited to invite the data platform team of TikTok to talk about how they use [Apache Doris](https://doris.apache.org) in their real-time data architecture and how they benefit from it, which could serve as a model for effective real-time data warehousing. + +## Real-time data warehouse in TikTok + + + +Before migration to Apache Doris, the real-time data was transferred by Flink, with Kafka enabling data flow between different data layers. Because Kafka itself does not have a logical table, it was hard to develop on it as easily as on Hive. + +For TikTok, there is a significant gap in the data volume between real-time data and offline data. Due to the costs in development, operation, and resources associated with real-time data, the team tended to downgrade the real-time data requirements, but it was only a duct-tape solution. + +- **Development costs**: Since Flink is a stateful data stream engine with incremental state, it requires developers to have better knowledge of the underlying architecture, especially in multi-stream JOINs. The incremental state makes it impossible to store the full data state in memory like Hive. Real-time data requires a huge storage volume and the use of various computing engines (OLTP engines like MySQL, OLAP engines like ClickHouse and Apache Doris, and KV stores like Abase, Tier, [...] + +- **Maintenance costs**: Complex multi-stream JOIN operations often require storing large amounts of state data, which can lead to stability issues, especially when processing continuous live streams. The live stream business of TikTok is under continuous innovation. When there is a data schema change, direct deployment might cause data recovery failure due to altered state structures. + +- **Resources**: Under-utilization of resources is a common issue in real-time scenarios. For example, at the beginning of a sales campaign, there is often a tidal surge of traffic, which will quickly decrease after a few minutes. However, to ensure stability throughout the campaign, a high resource level must be maintained 24/7, leading to resource waste. + +TikTok was looking for a solution that provides the same convenience as offline data development and reduces resource costs. Additionally, since the intermediate state data of Flink is inaccessible, it will be great that the new solution can enable quick and easy data testing, which is a prerequisite for data accuracy. + + + +The Flink-based architecture has been a mature solution in TikTok. It was primarily used for well-established business applications. In terms of data storage, it utilized the logical table format provided by Kafka. Despite the lack of fields, constraints, and high data traceability, this logical table approach supported over half of the real-time data development. + +The new architecture is based on Apache Doris. It is simpler and similar to an offline Hive setup. The key to this Doris-based architecture is the combination of the sub-second scheduling engine with the OLAP engine. This enables data layering and the reuse of offline development. + +## OLAP engine + +To serve the live streaming business in TikTok, the OLAP engine should be well-performed in: + +- **Cross-site disaster recovery**: This provides stability guarantee for live streaming to avoid significant financial losses caused by service unavailability. + +- **Read-write isolation**: This is another guarantee of stability. + +- **Cross-cluster ETL**: Data is scattered across different clusters for different business scenarios. For example, Cluster B and C both process transaction data, which should be synchronized from Cluster B to C, otherwise it will lead to duplicate data warehouse construction across business lines and impose burden on manpower and resources. + + + +This is how TikTok addresses these challenges: + +- **Cross-site disaster recovery**: Each table is stored with three replicas. The replicas are distributed across the data centers to ensure availability in each site. Message queues from the production end go through intermediate processing and arrive at the consumption end, forming the full data service link. In the event of a single data center outage, there are policies in place for both production and consumption to ensure service efficiency and stability. + +- **Read-write isolation**: The read and write traffic is routed to different cluster groups. + +- **Cross-cluster ETL**: For cross-cluster reads and writes, TikTok employs two mechanisms based on different business requirements and time sensitivity. One is to use Spark to read the data source format into a Yarn cluster and then synchronize it to the other clusters. The other is to leverage the [cross-cluster replication](https://doris.apache.org/docs/admin-manual/data-admin/ccr) capability of Apache Doris. The Spark on Doris method is more stable and does not consume the computing [...] + +## Real-time ranking board + +How is the real-time data warehouse supporting the live streaming business of TikTok? + +It builds a real-time ranking board to monitor its live streaming business performance. As is mentioned, it was migrated from Flink to Apache Doris. The new solution has a clear definition of metadata. + + + +The metadata is parsed from the fields in the real-time tables and then given definitions. Defining the metadata is a way to abstract the business logic of the ranking board. + +This also involves defining the partitioning logic of the real-time ranking board. By simple configurations, the corresponding Flink tasks can be quickly created. + +However, the need for such real-time ranking boards proliferates and challenges the Flink architecture in two ways: + +- Firstly, the excessive number of ranking boards causes a surge in tasks, making resource management more difficult, especially real-time stream processing that runs 24/7. + +- Secondly, alerts from real-time tasks are getting more frequent. Also, the large number of tasks consuming the same message queue amplifies traffic, imposing an additional burden on HDFS. + +Moreover, since large promotional events in e-commerce tend to run for a long period, the long-cycle computation is a threat to the stability of Flink. **This also makes backtracking difficult.** To address these issues, maintenance personnel often need to perform operations at midnight when the state is relatively small and backtracking pressure is reduced. + +Compared to the solution on Flink, the Doris-based data warehouse consumes less resources and generates less alerts. Plus, **because the states are stored in Doris tables, long-cycle computation becomes more flexible**. + +Join Apache Doris community on [Slack](https://join.slack.com/t/apachedoriscommunity/shared_invite/zt-2gmq5o30h-455W226d79zP3L96ZhXIoQ)! \ No newline at end of file diff --git a/src/constant/newsletter.data.ts b/src/constant/newsletter.data.ts index dfeb8360824..d849446d97e 100644 --- a/src/constant/newsletter.data.ts +++ b/src/constant/newsletter.data.ts @@ -6,6 +6,13 @@ export const NEWSLETTER_DATA = [ to: '/blog/release-note-2.1.6', image: '2.1.6.jpg', }, + { + tags: ['Best Practice'], + title: "Real-time data warehouse in TikTok based on Apache Doris", + content: `We are excited to invite the data platform team of TikTok to talk about how they use Apache Doris in their real-time data architecture and how they benefit from it, which could serve as a model for effective real-time data warehousing.`, + to: '/blog/real-time-data-warehouse-in-TikTok-based-on-doris', + image: 'real-time-data-warehouse-in-tiktok-based-on-apache-doris.png', + }, { tags: ['Top News'], title: "How AI unicorn MiniMax migrated from Loki and built a PB-scale logging system with Apache Doris", @@ -21,13 +28,5 @@ export const NEWSLETTER_DATA = [ image: 'auto-partition-in-apache-doris.jpg', }, - { - tags: ['Best Practice'], - title: "Migrate data lakehouse from BigQuery to Apache Doris, saving $4,500 per month ", - content: `Dien Tran Thanh shares his firsthand experience, proven best practices, and insightful suggestions for migrating from BigQuery to Apache Doris to build a more cost-efficient data warehouse.`, - to: '/blog/migrate-lakehouse-from-bigquery-to-doris', - image: 'migrate-lakehouse-from-bigquery-to-apache-doris.jpg', - }, - ]; diff --git a/static/images/OLAP-engine.PNG b/static/images/OLAP-engine.PNG new file mode 100644 index 00000000000..36fdfcbcebd Binary files /dev/null and b/static/images/OLAP-engine.PNG differ diff --git a/static/images/real-time-data-warehouse-in-TikTok-2.PNG b/static/images/real-time-data-warehouse-in-TikTok-2.PNG new file mode 100644 index 00000000000..4c5a7a5f6ba Binary files /dev/null and b/static/images/real-time-data-warehouse-in-TikTok-2.PNG differ diff --git a/static/images/real-time-data-warehouse-in-TikTok.PNG b/static/images/real-time-data-warehouse-in-TikTok.PNG new file mode 100644 index 00000000000..658e15d010a Binary files /dev/null and b/static/images/real-time-data-warehouse-in-TikTok.PNG differ diff --git a/static/images/real-time-data-warehouse-in-tiktok-based-on-apache-doris.png b/static/images/real-time-data-warehouse-in-tiktok-based-on-apache-doris.png new file mode 100644 index 00000000000..b62362a7f50 Binary files /dev/null and b/static/images/real-time-data-warehouse-in-tiktok-based-on-apache-doris.png differ diff --git a/static/images/real-time-ranking-board.png b/static/images/real-time-ranking-board.png new file mode 100644 index 00000000000..67ded02e021 Binary files /dev/null and b/static/images/real-time-ranking-board.png differ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org