Skip to content

tzdb.cpp: Consider directly accessing ICU's time zone info, instead of going through the ucal_ API #6158

@cpplearner

Description

@cpplearner

Currently, MSVC STL relies on ICU's UCalendar API to compute time zone information. This is convenient, but slow.

Instead, we can directly access ICU's time zone data (with ICU's ures_ API), and do the computation ourselves. I have a proof of concept that implements something equivalent to time_zone::get_info(sys_time) (except that it doesn't compute the abbrev), and it shows a 60x to 80x speedup compared to the original get_info.

This is complicated, but maybe the speedup is worth it. Plus, people are not satisfied with the current performance (#2842).

My proof of concept
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <memory>
#include <print>
#include <ranges>
#include <span>
#include <string>
#include <string_view>

#include <icu.h>

struct ures_closer {
    static void operator()(UResourceBundle* p) {
        ures_close(p);
    };
};

using ures_ptr = std::unique_ptr<UResourceBundle, ures_closer>;

struct time_zone_data {
    time_zone_data() = default;
    time_zone_data(const time_zone_data&) = delete;
    time_zone_data& operator=(const time_zone_data&) = delete;

    std::span<const int32_t> trans;
    std::span<const int32_t> transPre32;
    std::span<const int32_t> transPost32;
    std::span<const int32_t> typeOffsets;
    std::span<const uint8_t> typeMap;
    int32_t startMonth;
    int32_t startDay;
    int32_t startDayOfWeek;
    int32_t startTime;
    int32_t startTimeMode;
    int32_t endMonth;
    int32_t endDay;
    int32_t endDayOfWeek;
    int32_t endTime;
    int32_t endTimeMode;
    int32_t finalDstOffset;
    int32_t finalRawOffset;
};

struct tzdb_data {
    ures_ptr zoneinfo_res;
    std::vector<std::string> zone_names;
    std::vector<std::unique_ptr<time_zone_data>> zone_data;
};

tzdb_data* get_tzdb_data(UErrorCode& status) {
    static constinit tzdb_data result;
    if (!result.zoneinfo_res) {
        result.zoneinfo_res.reset(ures_openDirect(nullptr, "zoneinfo64", &status));
        if (U_FAILURE(status)) {
            return nullptr;
        }

        const ures_ptr names_res{ures_getByKey(result.zoneinfo_res.get(), "Names", nullptr, &status)};
        if (U_FAILURE(status)) {
            return nullptr;
        }

        const int32_t names_size = ures_getSize(names_res.get());
        result.zone_names.resize(names_size);
        result.zone_data.resize(names_size);
        for (int32_t i = 0; i < names_size; ++i) {
            int32_t name_length;
            const char16_t* const name = ures_getStringByIndex(names_res.get(), i, &name_length, &status);
            if (U_FAILURE(status)) {
                return nullptr;
            }

            std::string s;
            s.resize_and_overwrite(name_length, [&](char* p, std::string::size_type) {
                for (int32_t i = 0; i < name_length; ++i) {
                    p[i] = static_cast<char>(name[i]);
                }

                return name_length;
            });

            result.zone_names[i] = s;
        }
    }

    return &result;
}

std::span<const int32_t> get_int_vector_as_span(UResourceBundle* res, UErrorCode& status) {
    int32_t size;
    const int32_t* const p = ures_getIntVector(res, &size, &status);
    if (U_FAILURE(status)) {
        return {};
    }

    return {p, static_cast<std::size_t>(size)};
}

time_zone_data* get_time_zone_data(tzdb_data* const db, const std::string_view tz_name, UErrorCode& status) {
    if (!db) {
        return nullptr;
    }

    const auto idx = std::ranges::lower_bound(db->zone_names, tz_name) - db->zone_names.begin();
    if (idx == db->zone_names.size() || tz_name != db->zone_names[idx]) {
        return nullptr;
    }

    if (db->zone_data[idx]) {
        return db->zone_data[idx].get();
    }

    std::unique_ptr<time_zone_data> p{new time_zone_data};

    const ures_ptr zones_res{ures_getByKey(db->zoneinfo_res.get(), "Zones", nullptr, &status)};
    const ures_ptr zone_res{ures_getByIndex(zones_res.get(), idx, nullptr, &status)};
    if (U_FAILURE(status)) {
        return nullptr;
    }

    const ures_ptr res{ures_getByKey(zone_res.get(), "trans", nullptr, &status)};
    p->trans = get_int_vector_as_span(res.get(), status);
    if (status == U_MISSING_RESOURCE_ERROR) {
        status = U_ZERO_ERROR;
    }

    ures_getByKey(zone_res.get(), "transPre32", res.get(), &status);
    p->transPre32 = get_int_vector_as_span(res.get(), status);
    if (status == U_MISSING_RESOURCE_ERROR) {
        status = U_ZERO_ERROR;
    }

    ures_getByKey(zone_res.get(), "transPost32", res.get(), &status);
    p->transPost32 = get_int_vector_as_span(res.get(), status);
    if (status == U_MISSING_RESOURCE_ERROR) {
        status = U_ZERO_ERROR;
    }

    ures_getByKey(zone_res.get(), "typeOffsets", res.get(), &status);
    p->typeOffsets = get_int_vector_as_span(res.get(), status);

    ures_getByKey(zone_res.get(), "typeMap", res.get(), &status);
    int32_t type_map_size;
    const uint8_t* const type_map_data = ures_getBinary(res.get(), &type_map_size, &status);

    if (p->transPre32.size() % 2 != 0 || p->transPost32.size() % 2 != 0 || p->typeOffsets.size() % 2 != 0) {
        return nullptr;
    }

    if (U_FAILURE(status)) {
        return nullptr;
    }

    p->typeMap = std::span{type_map_data, type_map_data + type_map_size};

    ures_getByKey(zone_res.get(), "finalRule", res.get(), &status);
    if (status == U_MISSING_RESOURCE_ERROR) {
        status = U_ZERO_ERROR;
        p->finalDstOffset = 0;
    } else {
        int32_t rule_name_size;
        const char16_t* const rule_name_data = ures_getString(res.get(), &rule_name_size, &status);
        if (U_FAILURE(status)) {
            return nullptr;
        }

        std::string s;
        s.resize_and_overwrite(rule_name_size, [&](char* p, std::string::size_type) {
            for (int32_t i = 0; i < rule_name_size; ++i) {
                p[i] = static_cast<char>(rule_name_data[i]);
            }

            return rule_name_size;
        });

        const ures_ptr rules_res{ures_getByKey(db->zoneinfo_res.get(), "Rules", nullptr, &status)};
        ures_getByKey(rules_res.get(), s.data(), res.get(), &status);
        const auto rule = get_int_vector_as_span(res.get(), status);
        if (rule.size() != 11) {
            return nullptr;
        }

        p->startMonth     = rule[0];
        p->startDay       = rule[1];
        p->startDayOfWeek = rule[2];
        p->startTime      = rule[3];
        p->startTimeMode  = rule[4];
        p->endMonth       = rule[5];
        p->endDay         = rule[6];
        p->endDayOfWeek   = rule[7];
        p->endTime        = rule[8];
        p->endTimeMode    = rule[9];
        p->finalDstOffset = rule[10];

        ures_getByKey(zone_res.get(), "finalRaw", res.get(), &status);
        p->finalRawOffset = ures_getInt(res.get(), &status);

        if (U_FAILURE(status)) {
            return nullptr;
        }
    }

    db->zone_data[idx] = std::move(p);
    return db->zone_data[idx].get();
}

struct my_sys_info {
    std::chrono::sys_seconds begin;
    std::chrono::sys_seconds end;
    std::chrono::seconds offset;
    std::chrono::minutes save;
};

constexpr auto compute_trans_date(std::chrono::year y, int32_t m, int32_t d, int32_t day_of_week) {
    using std::chrono::sys_days, std::chrono::days, std::chrono::weekday, std::chrono::last, std::chrono::operator/;
    if (day_of_week < 0) {
        day_of_week = -day_of_week;
        if (d < 0) {
            const sys_days bound = y/(m + 1)/-d;
            return bound - (weekday{bound} - weekday(day_of_week - 1));
        } else {
            const sys_days bound = y/(m + 1)/d;
            return bound + (weekday(day_of_week - 1) - weekday{bound});
        }
    } else if (day_of_week == 0) {
        return sys_days{y/(m + 1)/d};
    } else {
        if (d > 0) {
            return sys_days{y/(m + 1)/weekday(day_of_week - 1)[d]};
        } else {
            const sys_days date = sys_days{y/(m + 1)/weekday(day_of_week - 1)[last]};
            return date - days((-d + 1) * 7);
        }
    }
}

enum : int32_t {
    WALL_TIME = 0,
    STANDARD_TIME = 1,
    UTC_TIME = 2,
};

auto compute_dst_start(std::chrono::year y, const time_zone_data& tz) {
    const auto date = compute_trans_date(y, tz.startMonth, tz.startDay, tz.startDayOfWeek);
    auto timestamp = date.time_since_epoch().count() * 86400 + tz.startTime;

    if (tz.startTimeMode != UTC_TIME) {
        timestamp -= tz.finalRawOffset;
    }

    return timestamp;
}

auto compute_dst_end(std::chrono::year y, const time_zone_data& tz) {
    const auto date = compute_trans_date(y, tz.endMonth, tz.endDay, tz.endDayOfWeek);
    auto timestamp = date.time_since_epoch().count() * 86400 + tz.endTime;

    if (tz.endTimeMode == WALL_TIME) {
        timestamp -= tz.finalRawOffset + tz.finalDstOffset;
    } else if (tz.endTimeMode == STANDARD_TIME) {
        timestamp -= tz.finalRawOffset;
    }

    return timestamp;
}

my_sys_info my_get_sys_info(const std::string_view tz_name, const long long seconds_from_epoch) {
    UErrorCode status = U_ZERO_ERROR;
    const auto db = get_tzdb_data(status);
    const auto tz = get_time_zone_data(db, tz_name, status);

    constexpr auto to_int64 = [](auto r) {
        return static_cast<int64_t>(static_cast<uint32_t>(r[0])) << 32 | static_cast<int64_t>(static_cast<uint32_t>(r[1]));
    };

    const auto transPre32_view = tz->transPre32 | std::views::chunk(2) | std::views::transform(to_int64);
    const auto transPost32_view = tz->transPost32 | std::views::chunk(2) | std::views::transform(to_int64);

    int64_t begin = INT64_MIN;
    int64_t end = INT64_MAX;

    int32_t idx = 0;
    if (seconds_from_epoch < INT32_MIN) {
        const auto pos = std::ranges::upper_bound(transPre32_view, seconds_from_epoch);
        idx = pos - transPre32_view.begin();

        if (pos != transPre32_view.end()) {
            end = *pos;
        } else if (!tz->trans.empty()) {
            end = tz->trans.front();
        } else if (!transPost32_view.empty()) {
            end = transPost32_view.front();
        }

        if (pos != transPre32_view.begin()) {
            begin = transPre32_view[pos - transPre32_view.begin() - 1];
        }
    } else if (seconds_from_epoch > INT32_MAX) {
        const auto pos = std::ranges::upper_bound(transPost32_view, seconds_from_epoch);
        idx = transPre32_view.size() + tz->trans.size() + (pos - transPost32_view.begin());

        if (pos != transPost32_view.end()) {
            end = *pos;
        }

        if (pos != transPost32_view.begin()) {
            begin = transPost32_view[pos - transPost32_view.begin() - 1];
        } else if (!tz->trans.empty()) {
            begin = tz->trans.back();
        } else if (!transPre32_view.empty()) {
            begin = transPre32_view.back();
        }
    } else {
        const auto pos = std::ranges::upper_bound(tz->trans, seconds_from_epoch);
        idx = transPre32_view.size() + (pos - tz->trans.begin());

        if (pos != tz->trans.end()) {
            end = *pos;
        } else if (!transPost32_view.empty()) {
            end = transPost32_view.front();
        }

        if (pos != tz->trans.begin()) {
            begin = tz->trans[pos - tz->trans.begin() - 1];
        } else if (!transPre32_view.empty()) {
            begin = transPre32_view.back();
        }
    }

    constexpr std::chrono::sys_seconds min_tp{std::chrono::sys_days{std::chrono::year::min()/1/1}};
    constexpr auto max_tp =
        std::chrono::sys_seconds{std::chrono::sys_days{std::chrono::year::max()/12/32}} - std::chrono::seconds{1};

    if (idx >= tz->typeMap.size() && tz->finalDstOffset != 0) {
        if (!transPost32_view.empty()) {
            begin = transPost32_view.back();
        } else if (!tz->trans.empty()) {
            begin = tz->trans.back();
        } else if (!transPre32_view.empty()) {
            begin = transPre32_view.back();
        }

        const std::chrono::sys_seconds tp{std::chrono::seconds{seconds_from_epoch}};
        const std::chrono::year_month_day ymd{std::chrono::floor<std::chrono::days>(tp)};
        const auto y = ymd.year();

        const bool southern = tz->startMonth > tz->endMonth;
        bool is_dst;

        const auto first_yearly_transition = southern ? compute_dst_end(y, *tz) : compute_dst_start(y, *tz);

        constexpr std::chrono::years one_year{1};

        if (seconds_from_epoch < first_yearly_transition) {
            const auto prev_transition = southern ?
                compute_dst_start(y - one_year, *tz) : compute_dst_end(y - one_year, *tz);
            if (begin < prev_transition) {
                begin = prev_transition;
            }

            end = first_yearly_transition;
            is_dst = southern;
        } else {
            const auto second_yearly_transition =
                southern ? compute_dst_start(y, *tz) : compute_dst_end(y, *tz);
            if (seconds_from_epoch < second_yearly_transition) {
                begin = first_yearly_transition;
                end = second_yearly_transition;
                is_dst = !southern;
            } else {
                const auto next_transition = southern ?
                    compute_dst_end(y + one_year, *tz) : compute_dst_start(y + one_year, *tz);
                begin = second_yearly_transition;
                end = next_transition;
                is_dst = southern;
            }
        }

        const int32_t std_offset = tz->finalRawOffset;
        const int32_t save = is_dst ? tz->finalDstOffset : 0;
        return {
            begin == INT64_MIN ? min_tp : std::chrono::sys_seconds{std::chrono::seconds{begin}},
            end == INT64_MAX ? max_tp : std::chrono::sys_seconds{std::chrono::seconds{end}},
            std::chrono::seconds{std_offset + save},
            std::chrono::minutes{save / 60}
        };
    } else {
        const auto off = idx == 0 ? 0 : tz->typeMap[idx - 1];
        assert(off < tz->typeOffsets.size() / 2);
        const auto std_offset = tz->typeOffsets[off * 2];
        const auto save = tz->typeOffsets[off * 2 + 1];

        return {
            begin == INT64_MIN ? min_tp : std::chrono::sys_seconds{std::chrono::seconds{begin}},
            end == INT64_MAX ? max_tp : std::chrono::sys_seconds{std::chrono::seconds{end}},
            std::chrono::seconds{std_offset + save},
            std::chrono::minutes{save / 60}
        };
    }
}

int main() {
    using namespace std::chrono;
    const std::string_view tz_names[] = {
        "Asia/Shanghai",
    };

    const sys_seconds test_tp[] = {
        sys_days{2026y/April/5} + 3h,
    };

    for (const std::string_view tz_name : tz_names) {
        std::println("# {}", tz_name);

        for (const sys_seconds tp : test_tp) {
            std::println("## {}", tp);

            const auto t = my_get_sys_info(tz_name, tp.time_since_epoch().count());
            const auto& u = locate_zone(tz_name)->get_info(tp);

            std::println("{} {} {} {}", t.begin, t.end, t.offset, t.save);
            std::println("{} {} {} {}", u.begin, u.end, u.offset, u.save);
        }
    }
}

Relevant info:

  • The time zone data are stored in the zoneinfo64 resource.
  • ICU's olsontz.h describes how the data are stored, and simpletz.cpp describes how to decode a recurrent rule.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions