From d0ff60b1cd3e0749953fb28ad21eb1fb0ed2b0a0 Mon Sep 17 00:00:00 2001 From: Alvaro Huarte Date: Sat, 14 Feb 2026 19:49:59 +0100 Subject: [PATCH 1/2] Build extension for WASM --- .../workflows/MainDistributionPipeline.yml | 2 +- CMakeLists.txt | 9 ++++++ extension-ci-tools | 2 +- src/eurostat/eurostat.cpp | 1 - src/eurostat/eurostat_data_functions.cpp | 13 --------- src/eurostat/eurostat_info_functions.cpp | 27 ------------------ src/eurostat/eurostat_scalar_functions.cpp | 3 -- src/eurostat/filter_encoder.cpp | 28 ------------------- src/eurostat/filter_encoder.hpp | 2 -- src/eurostat/http_request.cpp | 9 +++--- src/eurostat/http_request.hpp | 1 - src/eurostat/xml_element.cpp | 5 ---- src/eurostat/xml_element.hpp | 1 - 13 files changed, 15 insertions(+), 88 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 9c0145d..6b79aea 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -19,7 +19,7 @@ jobs: duckdb_version: v1.4.4 ci_tools_version: v1.4-andium extension_name: eurostat - exclude_archs: 'windows_amd64_mingw;wasm_mvp;wasm_eh;wasm_threads' + exclude_archs: 'windows_amd64_mingw' code-quality-check: name: Code Quality Check diff --git a/CMakeLists.txt b/CMakeLists.txt index fb6745e..924baac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,15 @@ find_package(LibXml2 REQUIRED) set(EXTENSION_NAME ${TARGET_NAME}_extension) set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension) +if(EMSCRIPTEN) + # _LINKED_LIBS influences only Wasm compilation it's unclear why this is + # needed, but somehow some global symbols are not properly + # exported otherwise this solves by basically re-linking (at the moment the + # Wasm binary is actually produced) + set(DUCKDB_EXTENSION_EUROSTAT_LINKED_LIBS + "../../vcpkg_installed/wasm32-emscripten/lib/lib*.a") +endif() + project(${TARGET_NAME}) include_directories(src/include) include_directories(${CMAKE_SOURCE_DIR}/third_party/httplib) diff --git a/extension-ci-tools b/extension-ci-tools index af154d2..32eb753 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit af154d26927bc8bdc9be4b3c58cde1c00d196388 +Subproject commit 32eb753d9b660bf90bdca42652cf40c1ef64bf67 diff --git a/src/eurostat/eurostat.cpp b/src/eurostat/eurostat.cpp index b13004a..11e6ed7 100644 --- a/src/eurostat/eurostat.cpp +++ b/src/eurostat/eurostat.cpp @@ -5,7 +5,6 @@ namespace eurostat { //! Get the level for a GEO code in the NUTS classification or if it is considered aggregates. std::string Dimension::GetGeoLevelFromGeoCode(const std::string &geo_code) { - #define STARTS_WITH(str, prefix) ((strlen(prefix) <= str.size()) && strncmp(str.c_str(), prefix, strlen(prefix)) == 0) // diff --git a/src/eurostat/eurostat_data_functions.cpp b/src/eurostat/eurostat_data_functions.cpp index 4af862e..6bbb207 100644 --- a/src/eurostat/eurostat_data_functions.cpp +++ b/src/eurostat/eurostat_data_functions.cpp @@ -45,7 +45,6 @@ namespace { //====================================================================================================================== struct ES_Read { - //------------------------------------------------------------------------------------------------------------------ // Bind //------------------------------------------------------------------------------------------------------------------ @@ -64,7 +63,6 @@ struct ES_Read { static unique_ptr Bind(ClientContext &context, TableFunctionBindInput &input, vector &return_types, vector &names) { - D_ASSERT(input.inputs.size() == 2); const string &provider_id = StringValue::Get(input.inputs[0]); @@ -127,7 +125,6 @@ struct ES_Read { //! Parse a data row from a TSV line. static bool ParseDatarow(State &data_table, const std::vector &time_periods, int32_t geo_column_index, const string &line, std::unordered_map &row_keys, const bool &check_keys) { - std::vector state_keys; // Split line by tabs. @@ -182,7 +179,6 @@ struct ES_Read { // Parse observation values for each time period. for (size_t i = 0; i < time_periods.size(); i++) { - // Duplicate row, skip. if (check_keys && state_keys[i]) { @@ -214,7 +210,6 @@ struct ES_Read { static std::vector GetDataUrls(ClientContext &context, TableFunctionInitInput &input, const std::vector &data_structure, const string &base_url, const BindData &bind_data) { - std::unordered_map urls; // Use complex filters previously parsed in 'PushdownComplexFilter' function. @@ -246,7 +241,6 @@ struct ES_Read { } static unique_ptr Init(ClientContext &context, TableFunctionInitInput &input) { - auto &bind_data = input.bind_data->Cast(); auto global_state = make_uniq_base(); auto &data_table = global_state->Cast(); @@ -268,7 +262,6 @@ struct ES_Read { bool check_keys = data_urls.size() > 1; for (const auto &data_url : data_urls) { - EUROSTAT_SCAN_DEBUG_LOG(1, "Fetching data from URL: %s", data_url.c_str()); // Execute HTTP GET request. @@ -311,7 +304,6 @@ struct ES_Read { while (std::getline(line_stream, line)) { if (!line.empty()) { - // Parse header line to... if (line_index == 0) { size_t pos = line.find("\\TIME_PERIOD"); @@ -368,7 +360,6 @@ struct ES_Read { //------------------------------------------------------------------------------------------------------------------ static void Execute(ClientContext &context, TableFunctionInput &input, DataChunk &output) { - auto &gstate = input.global_state->Cast(); // Calculate how many record we can fit in the output @@ -382,7 +373,6 @@ struct ES_Read { // Load current subset of rows. for (idx_t row_idx = 0, record_idx = current_row; row_idx < output_size; row_idx++, record_idx++) { - const auto &datarow = gstate.rows[record_idx]; const DimensionValues &dim_values = gstate.dimensions[datarow.dimension_index]; const auto &dim_count = dim_values.values.size(); @@ -439,7 +429,6 @@ struct ES_Read { static void PushdownComplexFilter(ClientContext &context, LogicalGet &get, FunctionData *bind_data_p, vector> &expressions) { - auto &bind_data = bind_data_p->Cast(); // Get column_ids from LogicalGet to map expression column indices to table columns. @@ -473,7 +462,6 @@ struct ES_Read { //------------------------------------------------------------------------------------------------------------------ static void Register(ExtensionLoader &loader) { - InsertionOrderPreservingMap tags; tags.insert("ext", "eurostat"); tags.insert("category", "table"); @@ -499,7 +487,6 @@ struct ES_Read { // ##################################################################################################################### void EurostatDataFunctions::Register(ExtensionLoader &loader) { - ES_Read::Register(loader); } diff --git a/src/eurostat/eurostat_info_functions.cpp b/src/eurostat/eurostat_info_functions.cpp index 1b7fe38..8e5c69b 100644 --- a/src/eurostat/eurostat_info_functions.cpp +++ b/src/eurostat/eurostat_info_functions.cpp @@ -29,7 +29,6 @@ namespace { //====================================================================================================================== struct ES_Endpoints { - //------------------------------------------------------------------------------------------------------------------ // Bind //------------------------------------------------------------------------------------------------------------------ @@ -42,7 +41,6 @@ struct ES_Endpoints { static unique_ptr Bind(ClientContext &context, TableFunctionBindInput &input, vector &return_types, vector &names) { - names.emplace_back("provider_id"); return_types.push_back(LogicalType::VARCHAR); names.emplace_back("organization"); @@ -122,7 +120,6 @@ struct ES_Endpoints { //------------------------------------------------------------------------------------------------------------------ static void Register(ExtensionLoader &loader) { - InsertionOrderPreservingMap tags; tags.insert("ext", "eurostat"); tags.insert("category", "table"); @@ -137,7 +134,6 @@ struct ES_Endpoints { //====================================================================================================================== struct ES_Dataflows { - #define INFO_COLUMN_PROVIDER_ID 0 #define INFO_COLUMN_DATAFLOW_ID 1 #define INFO_COLUMN_TYPE 2 @@ -174,7 +170,6 @@ struct ES_Dataflows { //! Parse DataflowInfo from JSON object inline static DataflowInfo ParseDataflow(const string &provider_id, yyjson_val *object_val, bool &load_datastructure, bool &load_annotations) { - yyjson_val *extension_val = nullptr; yyjson_val *annotation_val = nullptr; yyjson_val *attrib_val = nullptr; @@ -300,7 +295,6 @@ struct ES_Dataflows { static unique_ptr Bind(ClientContext &context, TableFunctionBindInput &input, vector &return_types, vector &names) { - std::vector providers; std::vector dataflows; string language = "en"; @@ -417,7 +411,6 @@ struct ES_Dataflows { }; static unique_ptr Init(ClientContext &context, TableFunctionInitInput &input) { - const auto &bind_data = input.bind_data->Cast(); const auto &providers = bind_data.providers; const auto &dataflows = bind_data.dataflows; @@ -519,7 +512,6 @@ struct ES_Dataflows { //------------------------------------------------------------------------------------------------------------------ static void Execute(ClientContext &context, TableFunctionInput &input, DataChunk &output) { - auto &gstate = input.global_state->Cast(); // Calculate how many record we can fit in the output @@ -659,7 +651,6 @@ struct ES_Dataflows { //------------------------------------------------------------------------------------------------------------------ static void Register(ExtensionLoader &loader) { - InsertionOrderPreservingMap tags; tags.insert("ext", "eurostat"); tags.insert("category", "table"); @@ -696,7 +687,6 @@ static constexpr const char *ES_VALUES_PATH = static constexpr const char *ES_ERROR_PATH = "/S:Fault/faultstring"; struct ES_DataStructure { - //! Information of a Dimension of an EUROSTAT Dataflow struct Dimension { int32_t position = -1; @@ -709,7 +699,6 @@ struct ES_DataStructure { //! Returns the basic data structure of an EUROSTAT Dataflow. static std::vector GetBasicDataSchema(ClientContext &context, const string &provider_id, const string &dataflow_id, const string &language) { - std::vector dimensions; // Execute HTTP GET request @@ -738,9 +727,7 @@ struct ES_DataStructure { xmlXPathObjectPtr xpath_obj = nullptr; for (const auto &xpath : {ES_DIMENSION_PATH, ES_TIME_DIMENSION_PATH}) { - if ((xpath_obj = xmlXPathEvalExpression(BAD_CAST xpath, xpath_ctx)) && xpath_obj->nodesetval) { - for (int i = 0; i < xpath_obj->nodesetval->nodeNr; i++) { xmlNodePtr node = xpath_obj->nodesetval->nodeTab[i]; @@ -790,7 +777,6 @@ struct ES_DataStructure { // Get the Concept names for Dimensions if ((xpath_obj = xmlXPathEvalExpression(BAD_CAST ES_CONCEPT_PATH, xpath_ctx)) && xpath_obj->nodesetval) { - for (int i = 0; i < xpath_obj->nodesetval->nodeNr; i++) { xmlNodePtr node = xpath_obj->nodesetval->nodeTab[i]; @@ -798,9 +784,7 @@ struct ES_DataStructure { if (!concept_id.empty()) { for (auto &dim : dimensions) { if (dim.concept_id == concept_id) { - for (xmlNodePtr child = node->children; child; child = child->next) { - if (strcmp((const char *)child->name, "Name") == 0) { string lang = XmlUtils::GetNodeAttributeValue(child, "lang", language); @@ -826,7 +810,6 @@ struct ES_DataStructure { //! Returns the data structure of an EUROSTAT Dataflow. static std::vector GetDataSchema(ClientContext &context, const string &provider_id, const string &dataflow_id, const string &language) { - auto dimensions = ES_DataStructure::GetBasicDataSchema(context, provider_id, dataflow_id, language); // Execute HTTP GET request @@ -853,7 +836,6 @@ struct ES_DataStructure { xmlXPathObjectPtr xpath_obj = nullptr; if ((xpath_obj = xmlXPathEvalExpression(BAD_CAST ES_VALUES_PATH, xpath_ctx)) && xpath_obj->nodesetval) { - for (int i = 0; i < xpath_obj->nodesetval->nodeNr; i++) { xmlNodePtr node = xpath_obj->nodesetval->nodeTab[i]; @@ -863,7 +845,6 @@ struct ES_DataStructure { for (auto &dim : dimensions) { if (dim.id == dim_id) { - for (xmlNodePtr child = node->children; child; child = child->next) { if (strcmp((const char *)child->name, "Value") == 0) { string code_value = XmlUtils::GetNodeTextContent(child); @@ -900,7 +881,6 @@ struct ES_DataStructure { static unique_ptr Bind(ClientContext &context, TableFunctionBindInput &input, vector &return_types, vector &names) { - D_ASSERT(input.inputs.size() == 2); const string provider_id = StringValue::Get(input.inputs[0]); @@ -975,7 +955,6 @@ struct ES_DataStructure { //------------------------------------------------------------------------------------------------------------------ static unique_ptr Cardinality(ClientContext &context, const FunctionData *data) { - auto &bind_data = data->Cast(); auto result = make_uniq(); @@ -991,7 +970,6 @@ struct ES_DataStructure { //------------------------------------------------------------------------------------------------------------------ static void Execute(ClientContext &context, TableFunctionInput &input, DataChunk &output) { - auto &bind_data = input.bind_data->Cast(); auto &gstate = input.global_state->Cast(); @@ -1086,7 +1064,6 @@ struct ES_DataStructure { //------------------------------------------------------------------------------------------------------------------ static void Register(ExtensionLoader &loader) { - InsertionOrderPreservingMap tags; tags.insert("ext", "eurostat"); tags.insert("category", "table"); @@ -1109,7 +1086,6 @@ struct ES_DataStructure { //! Returns the data structure (dimensions) of a given dataflow std::vector EurostatUtils::DataStructureOf(ClientContext &context, const std::string &provider_id, const std::string &dataflow_id) { - auto dimensions = ES_DataStructure::GetBasicDataSchema(context, provider_id, dataflow_id, "en"); std::vector data_structure; @@ -1122,7 +1098,6 @@ std::vector EurostatUtils::DataStructureOf(ClientContext &c //! Extracts the error message of a given Eurostat API response body std::string EurostatUtils::GetXmlErrorMessage(const std::string &response_body) { - XmlDocument document = XmlDocument(response_body); xmlXPathContextPtr xpath_ctx = document.GetXPathContext(); xmlXPathObjectPtr xpath_obj = nullptr; @@ -1131,7 +1106,6 @@ std::string EurostatUtils::GetXmlErrorMessage(const std::string &response_body) if ((xpath_obj = xmlXPathEvalExpression(BAD_CAST ES_ERROR_PATH, xpath_ctx)) && xpath_obj->nodesetval && xpath_obj->nodesetval->nodeNr > 0) { - xmlNodePtr node = xpath_obj->nodesetval->nodeTab[0]; error_msg = XmlUtils::GetNodeTextContent(node); } @@ -1144,7 +1118,6 @@ std::string EurostatUtils::GetXmlErrorMessage(const std::string &response_body) } void EurostatInfoFunctions::Register(ExtensionLoader &loader) { - ES_Endpoints::Register(loader); ES_Dataflows::Register(loader); ES_DataStructure::Register(loader); diff --git a/src/eurostat/eurostat_scalar_functions.cpp b/src/eurostat/eurostat_scalar_functions.cpp index f39516a..79bfc25 100644 --- a/src/eurostat/eurostat_scalar_functions.cpp +++ b/src/eurostat/eurostat_scalar_functions.cpp @@ -18,7 +18,6 @@ namespace { //====================================================================================================================== struct ES_GeoLevel { - //! Returns the level for a GEO code in the NUTS classification or if it is considered aggregates. inline static void GetGeoLevelFromGeoCode(DataChunk &args, ExpressionState &state, Vector &result) { D_ASSERT(args.data.size() == 1); @@ -52,7 +51,6 @@ struct ES_GeoLevel { //------------------------------------------------------------------------------------------------------------------ static void Register(ExtensionLoader &loader) { - InsertionOrderPreservingMap tags; tags.insert("ext", "eurostat"); tags.insert("category", "scalar"); @@ -71,7 +69,6 @@ struct ES_GeoLevel { // ##################################################################################################################### void EurostatScalarFunctions::Register(ExtensionLoader &loader) { - ES_GeoLevel::Register(loader); } diff --git a/src/eurostat/filter_encoder.cpp b/src/eurostat/filter_encoder.cpp index 42ca378..92cd2fa 100644 --- a/src/eurostat/filter_encoder.cpp +++ b/src/eurostat/filter_encoder.cpp @@ -40,7 +40,6 @@ static std::string VIRTUAL_DIMENSION_FLAG = "---"; static std::string TIME_PERIOD_DIMENSION_NAME = "time_period"; EurostatFilter::EurostatFilter(const std::vector &ds) : data_structure(ds) { - for (const auto &dim : data_structure) { if (dim.position == -1 || dim.name == TIME_PERIOD_DIMENSION_NAME) { dim_mask.push_back(VIRTUAL_DIMENSION_FLAG); @@ -51,7 +50,6 @@ EurostatFilter::EurostatFilter(const std::vector &ds) : dat } bool EurostatFilter::IsEmpty() const { - if (!start_period.empty() || !end_period.empty()) { return false; } @@ -103,7 +101,6 @@ std::string EurostatFilter::GetFilterString() const { } void EurostatFilterSet::PushEmptyFilter() { - EurostatFilter new_filter(data_structure); filters.emplace_back(std::move(new_filter)); } @@ -124,9 +121,7 @@ bool FilterEncoder::GetComparisonOperator(ExpressionType type, std::string &out_ bool FilterEncoder::EncodeFilter(const TableFilter &filter, const eurostat::Dimension &dimension, const idx_t &dim_index, EurostatFilterSet &out_result) { - switch (filter.filter_type) { - case TableFilterType::CONSTANT_COMPARISON: return EncodeConstantComparison(filter.Cast(), dimension, dim_index, out_result); @@ -155,7 +150,6 @@ bool FilterEncoder::EncodeFilter(const TableFilter &filter, const eurostat::Dime bool FilterEncoder::EncodeConstantComparison(const ConstantFilter &filter, const eurostat::Dimension &dimension, const idx_t &dim_index, EurostatFilterSet &out_result) { - if (filter.constant.IsNull()) { out_result.supported = false; return false; @@ -164,7 +158,6 @@ bool FilterEncoder::EncodeConstantComparison(const ConstantFilter &filter, const auto &out_filter = out_result.GetCurrentFilter(); if (dimension.name == TIME_PERIOD_DIMENSION_NAME) { - if (filter.comparison_type == ExpressionType::COMPARE_GREATERTHANOREQUALTO) { out_filter.start_period = "startPeriod=" + filter.constant.ToString(); return true; @@ -198,7 +191,6 @@ bool FilterEncoder::EncodeConstantComparison(const ConstantFilter &filter, const bool FilterEncoder::EncodeInFilter(const InFilter &filter, const eurostat::Dimension &dimension, const idx_t &dim_index, EurostatFilterSet &out_result) { - if (dimension.name == TIME_PERIOD_DIMENSION_NAME && filter.values.size() > 1) { // Eurostat API does not support multiple time period values. out_result.supported = false; @@ -217,7 +209,6 @@ bool FilterEncoder::EncodeInFilter(const InFilter &filter, const eurostat::Dimen bool FilterEncoder::EncodeConjunctionAnd(const ConjunctionAndFilter &filter, const eurostat::Dimension &dimension, const idx_t &dim_index, EurostatFilterSet &out_result) { - // Something's wrong if there are no child filters. if (filter.child_filters.empty()) { out_result.supported = false; @@ -235,7 +226,6 @@ bool FilterEncoder::EncodeConjunctionAnd(const ConjunctionAndFilter &filter, con bool FilterEncoder::EncodeConjunctionOr(const ConjunctionOrFilter &filter, const eurostat::Dimension &dimension, const idx_t &dim_index, EurostatFilterSet &out_result) { - // Something's wrong if there are no child filters. if (filter.child_filters.empty()) { out_result.supported = false; @@ -255,7 +245,6 @@ bool FilterEncoder::EncodeConjunctionOr(const ConjunctionOrFilter &filter, const FilterEncoderResult FilterEncoder::Encode(const TableFilterSet *filters, const std::vector &data_structure, const std::vector &column_ids) { - FilterEncoderResult result; result.supported = true; @@ -319,7 +308,6 @@ FilterEncoderResult FilterEncoder::Encode(const TableFilterSet *filters, if (result.supported) { for (const auto &out_filter : filter_set.filters) { - if (!out_filter.IsEmpty()) { std::string filter_clause = out_filter.GetFilterString(); result.filters.emplace_back(filter_clause); @@ -337,7 +325,6 @@ FilterEncoderResult FilterEncoder::Encode(const TableFilterSet *filters, FilterEncoderResult FilterEncoder::EncodeExpression(vector> &expressions, const std::vector &data_structure, const std::vector &column_ids) { - FilterEncoderResult result; result.supported = true; @@ -362,7 +349,6 @@ FilterEncoderResult FilterEncoder::EncodeExpression(vector &data_structure, const std::vector &column_ids) { - if (expr.GetExpressionClass() != ExpressionClass::BOUND_COLUMN_REF) { return -1; } @@ -401,18 +386,14 @@ int FilterEncoder::GetDimensionIndexFromColumnRef(const Expression &expr, bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vector &data_structure, const std::vector &column_ids, EurostatFilterSet &out_result) { - switch (expr.GetExpressionClass()) { - case ExpressionClass::BOUND_COMPARISON: { - const auto &op = expr.Cast(); // Handle column = constant comparisons. if (op.left->GetExpressionClass() == ExpressionClass::BOUND_COLUMN_REF && op.right->GetExpressionClass() == ExpressionClass::BOUND_CONSTANT) { - int dim_idx = GetDimensionIndexFromColumnRef(*op.left, data_structure, column_ids); if (dim_idx != -1) { const auto &dimension = data_structure[dim_idx]; @@ -428,7 +409,6 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect return false; } case ExpressionClass::BOUND_OPERATOR: { - const auto &op = expr.Cast(); // Handle column IN (values) comparisons. @@ -453,7 +433,6 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect vector values; for (idx_t child = 1; child < op.children.size(); child++) { - if (op.children[child]->GetExpressionClass() != ExpressionClass::BOUND_CONSTANT) { out_result.supported = false; return false; @@ -473,11 +452,9 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect return false; } case ExpressionClass::BOUND_CONJUNCTION: { - const auto &conjunction = expr.Cast(); if (conjunction.type == ExpressionType::CONJUNCTION_AND) { - // For AND, all children must be supported. for (const auto &child : conjunction.children) { if (!EncodeExpressionNode(*child, data_structure, column_ids, out_result)) { @@ -488,10 +465,8 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect return true; } if (conjunction.type == ExpressionType::CONJUNCTION_OR) { - // For OR, create separate dimension filter for each child. for (const auto &child : conjunction.children) { - if (!EncodeExpressionNode(*child, data_structure, column_ids, out_result)) { out_result.supported = false; return false; @@ -505,7 +480,6 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect return false; } case ExpressionClass::BOUND_BETWEEN: { - auto &op = expr.Cast(); // Handle column BETWEEN constant AND constant comparisons. @@ -519,12 +493,10 @@ bool FilterEncoder::EncodeExpressionNode(const Expression &expr, const std::vect int dim_idx = GetDimensionIndexFromColumnRef(*op.input, data_structure, column_ids); if (dim_idx != -1) { - const auto &dimension = data_structure[dim_idx]; // Only supported for "timePeriod", the lower bound as startPeriod and upper bound as endPeriod. if (dimension.name == TIME_PERIOD_DIMENSION_NAME) { - const auto &lower_const = op.lower->Cast(); const auto &upper_const = op.upper->Cast(); diff --git a/src/eurostat/filter_encoder.hpp b/src/eurostat/filter_encoder.hpp index c8355ca..cb72048 100644 --- a/src/eurostat/filter_encoder.hpp +++ b/src/eurostat/filter_encoder.hpp @@ -11,7 +11,6 @@ namespace duckdb { * Result of encoding a single T-SQL expression or filter to an Eurostat filter. */ struct EurostatFilter { - //! Reference to a data structure of dimensions. const std::vector &data_structure; @@ -36,7 +35,6 @@ struct EurostatFilter { * Result of encoding an entire T-SQL filter to a set of Eurostat filter clauses. */ struct EurostatFilterSet { - //! Reference to a data structure of dimensions. const std::vector &data_structure; diff --git a/src/eurostat/http_request.cpp b/src/eurostat/http_request.cpp index 1121488..08c544d 100644 --- a/src/eurostat/http_request.cpp +++ b/src/eurostat/http_request.cpp @@ -6,6 +6,7 @@ #include "duckdb/common/string_util.hpp" #include "duckdb/main/client_context.hpp" #include "duckdb/main/client_context_file_opener.hpp" +#include "duckdb/main/settings.hpp" #include "zstd.h" namespace duckdb { @@ -187,7 +188,6 @@ static string NormalizeHeaderName(const string &name) { // Extract HTTP settings from context (call from main thread) HttpSettings HttpRequest::ExtractHttpSettings(ClientContext &context, const string &url) { - HttpSettings settings; auto &db = DatabaseInstance::GetDatabase(context); auto &config = db.config; @@ -208,9 +208,9 @@ HttpSettings HttpRequest::ExtractHttpSettings(ClientContext &context, const stri FileOpener::TryGetCurrentSetting(&opener, "http_request_cache", settings.use_cache, &info); FileOpener::TryGetCurrentSetting(&opener, "http_follow_redirects", settings.follow_redirects, &info); - settings.proxy = config.options.http_proxy; - settings.proxy_username = config.options.http_proxy_username; - settings.proxy_password = config.options.http_proxy_password; + settings.proxy = HTTPProxySetting::GetSetting(context).ToString(); + settings.proxy_username = HTTPProxyUsernameSetting::GetSetting(context).ToString(); + settings.proxy_password = HTTPProxyPasswordSetting::GetSetting(context).ToString(); KeyValueSecretReader secret_reader(opener, &info, "http"); string proxy_from_secret; @@ -236,7 +236,6 @@ HttpSettings HttpRequest::ExtractHttpSettings(ClientContext &context, const stri HttpResponseData HttpRequest::ExecuteHttpRequest(const HttpSettings &settings, const string &url, const string &method, const duckdb_httplib_openssl::Headers &headers, const string &request_body, const string &content_type) { - HttpResponseData result; result.status_code = 0; result.content_length = -1; diff --git a/src/eurostat/http_request.hpp b/src/eurostat/http_request.hpp index 203af79..9f68fe5 100644 --- a/src/eurostat/http_request.hpp +++ b/src/eurostat/http_request.hpp @@ -40,7 +40,6 @@ struct HttpResponseData { //! Represents an HTTP request struct HttpRequest { - // Extract HTTP settings from context static HttpSettings ExtractHttpSettings(ClientContext &context, const string &url); diff --git a/src/eurostat/xml_element.cpp b/src/eurostat/xml_element.cpp index c8f0733..42a048d 100644 --- a/src/eurostat/xml_element.cpp +++ b/src/eurostat/xml_element.cpp @@ -10,7 +10,6 @@ static void XMLSilentXPathErrorHandler(void *ctx, const xmlError *error) { } XmlDocument::XmlDocument(const std::string &xml_str) : doc(nullptr), xpath_ctx(nullptr) { - // Parse the XML with options to suppress error messages (thread-safe, per-operation config) // XML_PARSE_NOERROR: suppress error reports to stderr // XML_PARSE_NOWARNING: suppress warning reports to stderr @@ -45,7 +44,6 @@ XmlDocument::XmlDocument(const std::string &xml_str) : doc(nullptr), xpath_ctx(n } XmlDocument::~XmlDocument() { - // Reset and free the XML resources if (xpath_ctx) { xmlSetStructuredErrorFunc(xpath_ctx, nullptr); @@ -60,7 +58,6 @@ XmlDocument::~XmlDocument() { //! Register all namespace declarations from the document into the XPath context void XmlDocument::RegisterNamespaces(xmlXPathContextPtr xpath_ctx) { - // This enables XPath expressions like "//gml:posList" to work when xmlns:gml="..." is declared. // Without this, libxml2's XPath engine requires manual registration of each namespace prefix. // See: https://grantm.github.io/perl-libxml-by-example/namespaces.html @@ -135,7 +132,6 @@ XmlElement XmlUtils::ProcessNode(xmlNodePtr node) { // Set text content (for element nodes, get direct text content only) if (node->type == XML_ELEMENT_NODE) { - // Get only direct text children, not all descendants for (xmlNodePtr child = node->children; child; child = child->next) { if (child->type == XML_TEXT_NODE && child->content) { @@ -152,7 +148,6 @@ XmlElement XmlUtils::ProcessNode(xmlNodePtr node) { // Set attributes for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) { - if (attr->name && attr->children && attr->children->content) { std::string attr_name((const char *)attr->name); std::string attr_value((const char *)attr->children->content); diff --git a/src/eurostat/xml_element.hpp b/src/eurostat/xml_element.hpp index 4559ac4..16f7e1c 100644 --- a/src/eurostat/xml_element.hpp +++ b/src/eurostat/xml_element.hpp @@ -54,7 +54,6 @@ class XmlDocument { //! Utility functions for XML processing struct XmlUtils { - //! Initialize libxml2 (call once at extension load) static void Initialize(); //! Cleanup libxml2 (optional, for clean shutdown) From 99a4d98217ee92592c6db002137bd4881d8d6192 Mon Sep 17 00:00:00 2001 From: Alvaro Huarte Date: Wed, 25 Feb 2026 02:54:08 +0100 Subject: [PATCH 2/2] Fix extension from WASM --- CMakeLists.txt | 4 +- src/eurostat/eurostat_data_functions.cpp | 3 +- src/eurostat/eurostat_info_functions.cpp | 9 +-- src/eurostat/http_request.cpp | 88 +++++++++++++++++++++++- src/eurostat/http_request.hpp | 9 ++- 5 files changed, 99 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 924baac..3a257ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,9 @@ endif() project(${TARGET_NAME}) include_directories(src/include) -include_directories(${CMAKE_SOURCE_DIR}/third_party/httplib) +if(NOT EMSCRIPTEN) + include_directories(${CMAKE_SOURCE_DIR}/third_party/httplib) +endif() set(EXTENSION_SOURCES src/eurostat_extension.cpp) add_subdirectory(src/eurostat) diff --git a/src/eurostat/eurostat_data_functions.cpp b/src/eurostat/eurostat_data_functions.cpp index 6bbb207..3d40e71 100644 --- a/src/eurostat/eurostat_data_functions.cpp +++ b/src/eurostat/eurostat_data_functions.cpp @@ -272,8 +272,7 @@ struct ES_Read { } url_count++; - auto response = - HttpRequest::ExecuteHttpRequest(settings, data_url, "GET", duckdb_httplib_openssl::Headers(), "", ""); + auto response = HttpRequest::ExecuteHttpRequest(settings, data_url, "GET", HttpHeaders(), "", ""); if (response.content_type == "application/xml") { std::string error_msg = EurostatUtils::GetXmlErrorMessage(response.body); diff --git a/src/eurostat/eurostat_info_functions.cpp b/src/eurostat/eurostat_info_functions.cpp index 8e5c69b..a9ed1b6 100644 --- a/src/eurostat/eurostat_info_functions.cpp +++ b/src/eurostat/eurostat_info_functions.cpp @@ -444,8 +444,7 @@ struct ES_Dataflows { // Execute HTTP GET request - auto response = - HttpRequest::ExecuteHttpRequest(settings, url, "GET", duckdb_httplib_openssl::Headers(), "", ""); + auto response = HttpRequest::ExecuteHttpRequest(settings, url, "GET", HttpHeaders(), "", ""); if (response.status_code != 200) { throw IOException( @@ -708,8 +707,7 @@ struct ES_DataStructure { "/latest?detail=referencepartial&references=descendants"; HttpSettings settings = HttpRequest::ExtractHttpSettings(context, url); - auto response = - HttpRequest::ExecuteHttpRequest(settings, url, "GET", duckdb_httplib_openssl::Headers(), "", ""); + auto response = HttpRequest::ExecuteHttpRequest(settings, url, "GET", HttpHeaders(), "", ""); if (response.status_code != 200) { throw IOException("EUROSTAT: Failed to fetch dataflow metadata from provider='%s', dataflow='%s': (%d) %s", @@ -818,8 +816,7 @@ struct ES_DataStructure { string url = it->second.api_url + "contentconstraint/" + provider_id + "/" + dataflow_id; HttpSettings settings = HttpRequest::ExtractHttpSettings(context, url); - auto response = - HttpRequest::ExecuteHttpRequest(settings, url, "GET", duckdb_httplib_openssl::Headers(), "", ""); + auto response = HttpRequest::ExecuteHttpRequest(settings, url, "GET", HttpHeaders(), "", ""); if (response.status_code != 200) { throw IOException("EUROSTAT: Failed to fetch dataflow metadata from provider='%s', dataflow='%s': (%d) %s", diff --git a/src/eurostat/http_request.cpp b/src/eurostat/http_request.cpp index 08c544d..bb628d6 100644 --- a/src/eurostat/http_request.cpp +++ b/src/eurostat/http_request.cpp @@ -232,10 +232,87 @@ HttpSettings HttpRequest::ExtractHttpSettings(ClientContext &context, const stri return settings; } +#ifdef __EMSCRIPTEN__ +#include + +// Execute HTTP request using synchronous XHR (works in Web Worker context where duckdb-wasm always runs). +// Uses arraybuffer to safely receive binary/compressed responses, then applies C++ decompression. +HttpResponseData HttpRequest::ExecuteHttpRequest(const HttpSettings &settings, const string &url, const string &method, + const HttpHeaders &headers, const string &request_body, + const string &content_type) { + HttpResponseData result; + result.status_code = 0; + result.content_length = -1; + + try { + int32_t status_code = 0; + int32_t body_len = 0; + + // Use arraybuffer to get raw bytes (safe for compressed/binary responses) + char *body_ptr = (char *)EM_ASM_PTR( + { + try { + var url = UTF8ToString($0); + var method = UTF8ToString($1); + var xhr = new XMLHttpRequest(); + xhr.open(method, url, false); // false = synchronous + xhr.responseType = 'arraybuffer'; + xhr.send(null); + HEAP32[$2 >> 2] = xhr.status; + if (xhr.response && xhr.response.byteLength > 0) { + var bytes = new Uint8Array(xhr.response); + var len = bytes.length; + HEAP32[$3 >> 2] = len; + var ptr = _malloc(len + 1); + HEAPU8.set(bytes, ptr); + HEAPU8[ptr + len] = 0; + return ptr; + } + HEAP32[$3 >> 2] = 0; + return 0; + } catch (e) { + HEAP32[$2 >> 2] = 0; + HEAP32[$3 >> 2] = 0; + return 0; + } + }, + url.c_str(), method.c_str(), &status_code, &body_len); + + result.status_code = status_code; + if (result.status_code == 0) { + result.error = "HTTP request failed (XHR error)"; + } + if (body_ptr && body_len > 0) { + string raw_body(body_ptr, static_cast(body_len)); + free(body_ptr); + // Auto-decompress (same logic as native path) + try { + if (GZipFileSystem::CheckIsZip(raw_body.data(), raw_body.size())) { + result.body = GZipFileSystem::UncompressGZIPString(raw_body); + } else if (CheckIsZstd(raw_body.data(), raw_body.size())) { + result.body = DecompressZstd(raw_body); + } else { + result.body = std::move(raw_body); + } + } catch (...) { + result.body = std::move(raw_body); + } + } else if (body_ptr) { + free(body_ptr); + } + } catch (std::exception &e) { + result.error = e.what(); + } + + return result; +} + +#else + // Execute HTTP request with given settings HttpResponseData HttpRequest::ExecuteHttpRequest(const HttpSettings &settings, const string &url, const string &method, - const duckdb_httplib_openssl::Headers &headers, - const string &request_body, const string &content_type) { + const HttpHeaders &headers, const string &request_body, + const string &content_type) { HttpResponseData result; result.status_code = 0; result.content_length = -1; @@ -266,7 +343,10 @@ HttpResponseData HttpRequest::ExecuteHttpRequest(const HttpSettings &settings, c } } - duckdb_httplib_openssl::Headers req_headers = headers; + duckdb_httplib_openssl::Headers req_headers; + for (auto &h : headers) { + req_headers.insert({h.first, h.second}); + } if (req_headers.find("User-Agent") == req_headers.end()) { req_headers.insert({"User-Agent", settings.user_agent}); } @@ -344,4 +424,6 @@ HttpResponseData HttpRequest::ExecuteHttpRequest(const HttpSettings &settings, c return result; } +#endif // __EMSCRIPTEN__ + } // namespace duckdb diff --git a/src/eurostat/http_request.hpp b/src/eurostat/http_request.hpp index 9f68fe5..e262e49 100644 --- a/src/eurostat/http_request.hpp +++ b/src/eurostat/http_request.hpp @@ -2,9 +2,11 @@ #include "duckdb.hpp" +#ifndef __EMSCRIPTEN__ // Use httplib directly for full HTTP method support #define CPPHTTPLIB_OPENSSL_SUPPORT #include "httplib.hpp" +#endif namespace duckdb { @@ -26,6 +28,9 @@ struct HttpSettings { bool follow_redirects; }; +//! Struct to hold HTTP headers map +using HttpHeaders = unordered_map; + //! Struct to hold HTTP response struct HttpResponseData { int32_t status_code; @@ -45,8 +50,8 @@ struct HttpRequest { // Execute HTTP request with given settings static HttpResponseData ExecuteHttpRequest(const HttpSettings &settings, const string &url, const string &method, - const duckdb_httplib_openssl::Headers &headers, - const string &request_body, const string &content_type); + const HttpHeaders &headers, const string &request_body, + const string &content_type); }; } // namespace duckdb