From 53fee9c2f72be3d14e63c419161b8e5a98f383af Mon Sep 17 00:00:00 2001
From: Rong Ma <rongma1997@gmail.com>
Date: Tue, 11 Mar 2025 16:27:43 +0000
Subject: [PATCH 1/3] Update agg/generator functions support

---
 ...elox-backend-aggregate-function-support.md |  71 +++++
 ...elox-backend-generator-function-support.md |  16 +
 .../apache/spark/sql/GlutenTestUtils.scala    |   3 +-
 tools/scripts/gen-function-support-docs.py    | 298 ++++++++++++------
 4 files changed, 289 insertions(+), 99 deletions(-)
 create mode 100644 docs/velox-backend-aggregate-function-support.md
 create mode 100644 docs/velox-backend-generator-function-support.md

diff --git a/docs/velox-backend-aggregate-function-support.md b/docs/velox-backend-aggregate-function-support.md
new file mode 100644
index 000000000000..910a2573a914
--- /dev/null
+++ b/docs/velox-backend-aggregate-function-support.md
@@ -0,0 +1,71 @@
+# Aggregate Functions Support Status
+
+**Out of 62 aggregate functions in Spark 3.5, Gluten currently fully supports 54 functions and partially supports 1 function.**
+
+## Aggregate Functions
+
+| Spark Functions       | Spark Expressions                  | Status   | Restrictions   |
+|-----------------------|------------------------------------|----------|----------------|
+| any                   | BoolOr                             | S        |                |
+| any_value             | AnyValue                           | S        |                |
+| approx_count_distinct | HyperLogLogPlusPlus                | S        |                |
+| approx_percentile     | ApproximatePercentile              | S        |                |
+| array_agg             | CollectList                        | S        |                |
+| avg                   | Average                            | S        |                |
+| bit_and               | BitAndAgg                          | S        |                |
+| bit_or                | BitOrAgg                           | S        |                |
+| bit_xor               | BitXorAgg                          | S        |                |
+| bitmap_construct_agg  | BitmapConstructAgg                 |          |                |
+| bitmap_or_agg         | BitmapOrAgg                        |          |                |
+| bool_and              | BoolAnd                            | S        |                |
+| bool_or               | BoolOr                             | S        |                |
+| collect_list          | CollectList                        | S        |                |
+| collect_set           | CollectSet                         | S        |                |
+| corr                  | Corr                               | S        |                |
+| count                 | Count                              | S        |                |
+| count_if              | CountIf                            | S        |                |
+| count_min_sketch      | CountMinSketchAggExpressionBuilder |          |                |
+| covar_pop             | CovPopulation                      | S        |                |
+| covar_samp            | CovSample                          | S        |                |
+| every                 | BoolAnd                            | S        |                |
+| first                 | First                              | S        |                |
+| first_value           | First                              | S        |                |
+| grouping              | Grouping                           | S        |                |
+| grouping_id           | GroupingID                         | S        |                |
+| histogram_numeric     | HistogramNumeric                   |          |                |
+| hll_sketch_agg        | HllSketchAgg                       |          |                |
+| hll_union_agg         | HllUnionAgg                        |          |                |
+| kurtosis              | Kurtosis                           | S        |                |
+| last                  | Last                               | S        |                |
+| last_value            | Last                               | S        |                |
+| max                   | Max                                | S        |                |
+| max_by                | MaxBy                              | S        |                |
+| mean                  | Average                            | S        |                |
+| median                | Median                             | S        |                |
+| min                   | Min                                | S        |                |
+| min_by                | MinBy                              | S        |                |
+| mode                  | Mode                               |          |                |
+| percentile            | Percentile                         | S        |                |
+| percentile_approx     | ApproximatePercentile              | S        |                |
+| regr_avgx             | RegrAvgX                           | S        |                |
+| regr_avgy             | RegrAvgY                           | S        |                |
+| regr_count            | RegrCount                          | S        |                |
+| regr_intercept        | RegrIntercept                      | S        |                |
+| regr_r2               | RegrR2                             | S        |                |
+| regr_slope            | RegrSlope                          | S        |                |
+| regr_sxx              | RegrSXX                            | S        |                |
+| regr_sxy              | RegrSXY                            | S        |                |
+| regr_syy              | RegrSYY                            | S        |                |
+| skewness              | Skewness                           | S        |                |
+| some                  | BoolOr                             | S        |                |
+| std                   | StddevSamp                         | S        |                |
+| stddev                | StddevSamp                         | S        |                |
+| stddev_pop            | StddevPop                          | S        |                |
+| stddev_samp           | StddevSamp                         | S        |                |
+| sum                   | Sum                                | S        |                |
+| try_avg               | TryAverageExpressionBuilder        | S        |                |
+| try_sum               | TrySumExpressionBuilder            | PS       |                |
+| var_pop               | VariancePop                        | S        |                |
+| var_samp              | VarianceSamp                       | S        |                |
+| variance              | VarianceSamp                       | S        |                |
+
diff --git a/docs/velox-backend-generator-function-support.md b/docs/velox-backend-generator-function-support.md
new file mode 100644
index 000000000000..ec57535a93e6
--- /dev/null
+++ b/docs/velox-backend-generator-function-support.md
@@ -0,0 +1,16 @@
+# Generator Functions Support Status
+
+**Out of 7 generator functions in Spark 3.5, Gluten currently fully supports 4 functions.**
+
+## Generator Functions
+
+| Spark Functions   | Spark Expressions        | Status   | Restrictions   |
+|-------------------|--------------------------|----------|----------------|
+| explode           | ExplodeExpressionBuilder | S        |                |
+| explode_outer     | ExplodeExpressionBuilder |          |                |
+| inline            | Inline                   | S        |                |
+| inline_outer      | Inline                   |          |                |
+| posexplode        | PosExplode               | S        |                |
+| posexplode_outer  | PosExplode               |          |                |
+| stack             | Stack                    | S        |                |
+
diff --git a/gluten-substrait/src/test/scala/org/apache/spark/sql/GlutenTestUtils.scala b/gluten-substrait/src/test/scala/org/apache/spark/sql/GlutenTestUtils.scala
index 35fe9518cee3..ef97ca3e02b3 100644
--- a/gluten-substrait/src/test/scala/org/apache/spark/sql/GlutenTestUtils.scala
+++ b/gluten-substrait/src/test/scala/org/apache/spark/sql/GlutenTestUtils.scala
@@ -18,8 +18,7 @@ package org.apache.spark.sql
 
 import org.apache.gluten.exception.GlutenException
 
-import org.apache.spark.SparkContext
-import org.apache.spark.TestUtils
+import org.apache.spark.{SparkContext, TestUtils}
 import org.apache.spark.scheduler.SparkListener
 import org.apache.spark.sql.test.SQLTestUtils
 
diff --git a/tools/scripts/gen-function-support-docs.py b/tools/scripts/gen-function-support-docs.py
index 1e09c45e5cec..87affacc1c97 100644
--- a/tools/scripts/gen-function-support-docs.py
+++ b/tools/scripts/gen-function-support-docs.py
@@ -502,15 +502,22 @@
     expression[StructsToCsv]("to_csv")
 '''
 
+FUNCTION_CATEGORIES = ['scalar', 'aggregate', 'window', 'generator']
+
 # Known Restrictions in Gluten.
 LOOKAROUND_UNSUPPORTED = 'Lookaround unsupported'
 GLUTEN_RESTRICTIONS = {
-    'regexp': LOOKAROUND_UNSUPPORTED,
-    'regexp_like': LOOKAROUND_UNSUPPORTED,
-    'rlike': LOOKAROUND_UNSUPPORTED,
-    'regexp_extract': LOOKAROUND_UNSUPPORTED,
-    'regexp_extract_all': LOOKAROUND_UNSUPPORTED,
-    'regexp_replace': LOOKAROUND_UNSUPPORTED
+    'scalar': {
+        'regexp': LOOKAROUND_UNSUPPORTED,
+        'regexp_like': LOOKAROUND_UNSUPPORTED,
+        'rlike': LOOKAROUND_UNSUPPORTED,
+        'regexp_extract': LOOKAROUND_UNSUPPORTED,
+        'regexp_extract_all': LOOKAROUND_UNSUPPORTED,
+        'regexp_replace': LOOKAROUND_UNSUPPORTED
+    },
+    'aggregate': {},
+    'window': {},
+    'generator': {}
 }
 
 SPARK_FUNCTION_GROUPS = {
@@ -553,15 +560,26 @@
                           'xml_funcs': "XML Functions"}
 
 FUNCTION_GROUPS = {'scalar': SCALAR_FUNCTION_GROUPS,
-                   'agg': {'agg_funcs': 'Aggregate Functions'},
+                   'aggregate': {'agg_funcs': 'Aggregate Functions'},
                    'window': {'window_funcs': 'Window Functions'},
                    'generator': {'generator_funcs': "Generator Functions"}}
 
+FUNCTION_SUITE_PACKAGE = 'org.apache.spark.sql.'
+FUNCTION_SUITES = {
+    'scalar': {'GlutenSQLQueryTestSuite'},
+    'aggregate': {'GlutenSQLQueryTestSuite', 'GlutenApproxCountDistinctForIntervalsQuerySuite',
+                  'GlutenBitmapExpressionsQuerySuite',
+                  'GlutenDataFrameAggregateSuite'},
+    'window': {'GlutenSQLQueryTestSuite'},
+    'generator': {'GlutenGeneratorFunctionSuite'}
+}
+
 
 def create_spark_function_map():
     exprs = list(map(lambda x: x if x[-1] != ',' else x[:-1],
                      map(lambda x: x.strip(),
-                         filter(lambda x: 'expression' in x, SPARK35_EXPRESSION_MAPPINGS.split('\n')))))
+                         filter(lambda x: 'expression' in x,
+                                SPARK35_EXPRESSION_MAPPINGS.split('\n')))))
 
     func_map = {}
     expression_pattern = 'expression[GeneratorOuter]*\[([\w0-9]+)\]\("([^\s]+)".*'
@@ -589,7 +607,8 @@ def create_spark_function_map():
 def generate_function_list():
     jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
 
-    infos = [["!=", '', 'predicate_funcs'], ["<>", "", "predicate_funcs"], ['between', '', 'predicate_funcs'],
+    infos = [["!=", '', 'predicate_funcs'], ["<>", "", "predicate_funcs"],
+             ['between', '', 'predicate_funcs'],
              ['case', '', 'predicate_funcs'], ["||", '', 'misc_funcs']]
     for jinfo in filter(lambda x: x.getGroup() in SPARK_FUNCTION_GROUPS, jinfos):
         infos.append([jinfo.getName(), jinfo.getClassName().split('.')[-1], jinfo.getGroup()])
@@ -608,61 +627,68 @@ def generate_function_list():
 
         group_functions[groupname].append(name)
         if groupname in SCALAR_FUNCTION_GROUPS:
-            scalar_functions.append(name)
+            functions['scalar'].add(name)
         elif groupname == 'agg_funcs':
-            agg_functions.append(name)
+            functions['aggregate'].add(name)
         elif groupname == 'window_funcs':
-            window_functions.append(name)
+            functions['window'].add(name)
         elif groupname == 'generator_funcs':
-            generator_functions.append(name)
+            functions['generator'].add(name)
         else:
-            logging.log(logging.WARNING, f"No matching group name for function {name}: " + groupname)
+            logging.log(logging.WARNING,
+                        f"No matching group name for function {name}: " + groupname)
 
 
 def parse_logs(log_file):
-    generator_functions = ['explode', 'explode_outer', 'inline', 'inline_outer', 'posexplode', 'posexplode_outer',
-                           'stack']
+    generator_functions = ['explode', 'explode_outer', 'inline', 'inline_outer', 'posexplode',
+                           'posexplode_outer', 'stack']
+
+    # 'unknown' holds functions that are not in the all_function_names list. Spark may have implemented such a
+    # function without exposing it to users in the current version.
+    support_list = {'scalar': {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()},
+                    'aggregate': {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()},
+                    'generator': {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()},
+                    'window': {'partial': set(), 'unsupported': set(), 'unsupported_expr': set(), 'unknown': set()}}
 
-    scalar_support_list = {'partial': set(), 'unsupported': set()}
-    agg_support_list = {'partial': set(), 'unsupported': set()}
-    window_support_list = {'partial': set(), 'unsupported': set()}
-    generator_support_list = {'partial': set(), 'unsupported': set()}
-    try_to_binary_funcs = set(['unhex', 'encode', 'unbase64'])
+    try_to_binary_funcs = {'unhex', 'encode', 'unbase64'}
 
     unresolved = []
 
     def filter_fallback_reasons():
-        f = open(log_file, 'r')
-        lines = f.readlines()
-        lines
-        ll = []
+        with open(log_file, 'r') as f:
+            lines = f.readlines()
+
+        validation_logs = []
 
         # Filter validation logs.
         for l in lines:
-            if (
-                    'Validation failed for plan:' in l or 'Validation failed due to' in l or 'Validation failed at file' in l or l.startswith(
-                ' - ') or l.startswith('   |- ')) and 'Native validation failed:' not in l:
-                ll.append(l)
+            if ('Validation failed for plan:' in l or
+                    'Validation failed due to' in l or
+                    'Validation failed at file' in l or
+                    l.startswith(' - ') and 'Native validation failed:' not in l or
+                    l.startswith('   |- ')):
+                validation_logs.append(l)
 
         # Extract fallback reasons.
-        al = []
-        for l in ll:
+        fallback_reasons = set()
+        for l in validation_logs:
             if 'due to:' in l:
-                al.append(l.split('due to:')[-1].strip())
+                fallback_reasons.add(l.split('due to:')[-1].strip())
             elif 'reason:' in l:
-                al.append(l.split('reason:')[-1].strip())
+                fallback_reasons.add(l.split('reason:')[-1].strip())
             else:
-                al.append(l)
-        al = sorted(set(al))
+                fallback_reasons.add(l)
+        fallback_reasons = sorted(fallback_reasons)
 
         # Remove udf.
-        return list(filter(lambda x: 'Not supported python udf' not in x and 'Not supported scala udf' not in x, al))
+        return list(filter(lambda x: 'Not supported python udf' not in x and 'Not supported scala udf' not in x,
+                           fallback_reasons))
 
     def function_name_tuple(function_name):
         return (
             function_name, None if function_name not in function_to_classname else function_to_classname[function_name])
 
-    def notFound(r):
+    def function_not_found(r):
         logging.log(logging.WARNING, f"No function name or class name found in: {r}")
         unresolved.append(r)
 
@@ -672,14 +698,19 @@ def notFound(r):
     for item in jexpression_mappings:
         gluten_expressions[item._1()] = item._2()
 
-    for f in scalar_functions:
-        if f not in gluten_expressions.values() and function_to_classname[f] not in gluten_expressions.keys():
-            scalar_support_list['unsupported'].add(function_name_tuple(f))
+    for category in FUNCTION_CATEGORIES:
+        if category == 'scalar':
+            for f in functions[category]:
+                # TODO: Remove this filter, as it may wrongly mark supported expressions (such as
+                #  RuntimeReplaceable and Builder expressions) as unsupported.
+                if f not in gluten_expressions.values() and function_to_classname[f] not in gluten_expressions.keys():
+                    support_list[category]['unsupported'].add(function_name_tuple(f))
 
-    for f in GLUTEN_RESTRICTIONS.keys():
-        scalar_support_list['partial'].add(function_name_tuple(f))
+        for f in GLUTEN_RESTRICTIONS[category].keys():
+            support_list[category]['partial'].add(function_name_tuple(f))
 
     for r in filter_fallback_reasons():
+        ############## Scalar functions ##############
         if 'Not supported to map spark function name to substrait function name' in r:
             pattern = r"class name: ([\w0-9]+)."
 
@@ -689,12 +720,17 @@ def notFound(r):
             if match:
                 class_name = match.group(1)
                 if class_name in classname_to_function:
-                    scalar_support_list['unsupported'].add((classname_to_function[class_name], class_name))
+                    function_name = classname_to_function[class_name]
+                    if function_name in all_function_names:
+                        support_list['scalar']['unsupported'].add((function_name, class_name))
+                    else:
+                        support_list['scalar']['unknown'].add((function_name, class_name))
                 else:
-                    logging.log(logging.INFO, f"No function name for class: {class_name}. Adding class name")
-                    scalar_support_list['unsupported'].add((None, class_name))
+                    logging.log(logging.INFO,
+                                f"No function name for class: {class_name}. Adding class name")
+                    support_list['scalar']['unsupported_expr'].add(class_name)
             else:
-                notFound(r)
+                function_not_found(r)
 
         elif 'Not support expression' in r:
             pattern = r"Not support expression ([\w0-9]+)"
@@ -705,12 +741,17 @@ def notFound(r):
             if match:
                 class_name = match.group(1)
                 if class_name in classname_to_function:
-                    scalar_support_list['unsupported'].add((classname_to_function[class_name], class_name))
+                    function_name = classname_to_function[class_name]
+                    if function_name in all_function_names:
+                        support_list['scalar']['unsupported'].add((function_name, class_name))
+                    else:
+                        support_list['scalar']['unknown'].add((function_name, class_name))
                 else:
-                    logging.log(logging.INFO, f"No function name for class: {class_name}. Adding class name")
-                    scalar_support_list['unsupported'].add((None, class_name))
+                    logging.log(logging.INFO,
+                                f"No function name for class: {class_name}. Adding class name")
+                    support_list['scalar']['unsupported_expr'].add(class_name)
             else:
-                notFound(r)
+                function_not_found(r)
 
         elif 'Scalar function name not registered:' in r:
             pattern = r"Scalar function name not registered:\s+([\w0-9]+)"
@@ -720,9 +761,12 @@ def notFound(r):
 
             if match:
                 function_name = match.group(1)
-                scalar_support_list['unsupported'].add(function_name_tuple(function_name))
+                if function_name in all_function_names:
+                    support_list['scalar']['unsupported'].add(function_name_tuple(function_name))
+                else:
+                    support_list['scalar']['unknown'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
 
         elif 'Function is not supported:' in r:
             pattern = r"Function is not supported:\s+([\w0-9]+)"
@@ -732,9 +776,12 @@ def notFound(r):
 
             if match:
                 function_name = match.group(1)
-                scalar_support_list['unsupported'].add(function_name_tuple(function_name))
+                if function_name in all_function_names:
+                    support_list['scalar']['unsupported'].add(function_name_tuple(function_name))
+                else:
+                    support_list['scalar']['unknown'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
 
         elif 'not registered with arguments:' in r:
             pattern = r"Scalar function ([\w0-9]+) not registered with arguments:"
@@ -744,10 +791,14 @@ def notFound(r):
 
             if match:
                 function_name = match.group(1)
-                scalar_support_list['partial'].add(function_name_tuple(function_name))
+                if function_name in all_function_names:
+                    support_list['scalar']['partial'].add(function_name_tuple(function_name))
+                else:
+                    support_list['scalar']['unknown'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
 
+        ############## Aggregate functions ##############
         elif 'Could not find a valid substrait mapping' in r:
             pattern = r"Could not find a valid substrait mapping name for ([\w0-9]+)\("
 
@@ -756,10 +807,29 @@ def notFound(r):
 
             if match:
                 function_name = match.group(1)
-                agg_support_list['unsupported'].add(function_name_tuple(function_name))
+                if function_name in all_function_names:
+                    support_list['aggregate']['unsupported'].add(function_name_tuple(function_name))
+                else:
+                    support_list['aggregate']['unknown'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
+
+        elif 'Unsupported aggregate mode' in r:
+            pattern = r"Unsupported aggregate mode: [\w]+ for ([\w0-9]+)"
+
+            # Extract the function name
+            match = re.search(pattern, r)
 
+            if match:
+                function_name = match.group(1)
+                if function_name in all_function_names:
+                    support_list['aggregate']['partial'].add(function_name_tuple(function_name))
+                else:
+                    support_list['aggregate']['unknown'].add(function_name_tuple(function_name))
+            else:
+                function_not_found(r)
+
+        ############## Generator functions ##############
         elif 'Velox backend does not support this generator:' in r:
             pattern = r"Velox backend does not support this generator:\s+([\w0-9]+)"
 
@@ -770,14 +840,15 @@ def notFound(r):
                 class_name = match.group(1)
                 function_name = class_name.lower()
                 if function_name not in generator_functions:
-                    generator_support_list['unsupported'].add((None, class_name))
+                    support_list['generator']['unknown'].add((None, class_name))
                 elif 'outer: true' in r:
-                    generator_support_list['unsupported'].add((function_name + '_outer', None))
+                    support_list['generator']['unsupported'].add((function_name + '_outer', None))
                 else:
-                    generator_support_list['unsupported'].add(function_name_tuple(function_name))
+                    support_list['generator']['unsupported'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
 
+        ############## Special judgements ##############
         elif 'try_eval' in r and ' is not supported' in r:
             pattern = r"try_eval\((\w+)\) is not supported"
             match = re.search(pattern, r)
@@ -790,48 +861,53 @@ def notFound(r):
                     function_name = 'try_to_binary'
                     p = function_name_tuple(function_name)
                     if len(try_to_binary_funcs) == 0:
-                        if p in scalar_support_list['partial']:
-                            scalar_support_list['partial'].remove(p)
-                        scalar_support_list['unsupported'].add(p)
+                        if p in support_list['scalar']['partial']:
+                            support_list['scalar']['partial'].remove(p)
+                        support_list['scalar']['unsupported'].add(p)
 
                 elif 'add' in function_name:
                     function_name = 'try_add'
-                    scalar_support_list['partial'].add(function_name_tuple(function_name))
+                    support_list['scalar']['partial'].add(function_name_tuple(function_name))
             else:
-                notFound(r)
+                function_not_found(r)
 
         elif 'Pattern is not string literal for regexp_extract' == r:
             function_name = 'regexp_extract'
-            scalar_support_list['partial'].add(function_name_tuple(function_name))
+            support_list['scalar']['partial'].add(function_name_tuple(function_name))
 
         elif 'Pattern is not string literal for regexp_extract_all' == r:
             function_name = 'regexp_extract_all'
-            scalar_support_list['partial'].add(function_name_tuple(function_name))
+            support_list['scalar']['partial'].add(function_name_tuple(function_name))
 
         else:
             unresolved.append(r)
 
-    return scalar_support_list, agg_support_list, window_support_list, generator_support_list, unresolved
+    return support_list, unresolved
 
 
-def generate_function_doc(category, function_support_list, output):
-    num_unsupported = len(list(filter(lambda x: x[0] is not None, function_support_list['unsupported'])))
-    num_unsupported_expression = len(
-        list(filter(lambda x: x[0] is None and x[1] is not None, function_support_list['unsupported'])))
-    num_partially_supported = len(list(filter(lambda x: x[0] is not None, function_support_list['partial'])))
-    num_supported = len(scalar_functions) - num_unsupported - num_partially_supported
+def generate_function_doc(category, output):
+    num_unsupported = len(list(filter(lambda x: x[0] is not None, support_list[category]['unsupported'])))
+    num_unsupported_expression = len(support_list[category]['unsupported_expr'])
+    num_unknown_function = len(support_list[category]['unknown'])
+    num_partially_supported = len(list(filter(lambda x: x[0] is not None, support_list[category]['partial'])))
+    num_supported = len(functions[category]) - num_unsupported - num_partially_supported
 
-    logging.log(logging.WARNING, f'Number of {category} functions: {len(scalar_functions)}')
-    logging.log(logging.WARNING, f'Number of unsupported {category} functions: {num_unsupported}')
-    logging.log(logging.WARNING, f'Number of unsupported {category} expressions: {num_unsupported_expression}')
-    logging.log(logging.WARNING, f'Number of partially supported {category} function: {num_partially_supported}')
+    logging.log(logging.WARNING, f'Number of {category} functions: {len(functions[category])}')
     logging.log(logging.WARNING, f'Number of fully supported {category} function: {num_supported}')
+    logging.log(logging.WARNING, f'Number of unsupported {category} functions: {num_unsupported}')
+    logging.log(logging.WARNING,
+                f'Number of partially supported {category} function: {num_partially_supported}')
+    logging.log(logging.WARNING,
+                f'Number of unsupported {category} expressions: {num_unsupported_expression}')
+    logging.log(logging.WARNING,
+                f'Number of unknown {category} function: {num_unknown_function}. List: {support_list[category]["unknown"]}')
 
     headers = ['Spark Functions', 'Spark Expressions', 'Status', 'Restrictions']
 
+    partially_supports = '.' if not num_partially_supported else f' and partially supports {num_partially_supported} functions.'
     lines = f'''# {category.capitalize()} Functions Support Status
 
-**Out of {len(scalar_functions)} {category} functions in Spark 3.5, Gluten currently fully supports {num_supported} functions and partially supports {num_partially_supported} functions.**
+**Out of {len(functions[category])} {category} functions in Spark 3.5, Gluten currently fully supports {num_supported} functions{partially_supports}**
 
 '''
 
@@ -842,12 +918,12 @@ def generate_function_doc(category, function_support_list, output):
             for f in sorted(group_functions[g]):
                 classname = '' if f not in spark_function_map else spark_function_map[f]
                 support = None
-                for item in function_support_list['partial']:
+                for item in support_list[category]['partial']:
                     if item[0] and item[0] == f or item[1] and item[1] == classname:
                         support = 'PS'
                         break
                 if support is None:
-                    for item in function_support_list['unsupported']:
+                    for item in support_list[category]['unsupported']:
                         if item[0] and item[0] == f or item[1] and item[1] == classname:
                             support = ''
                             break
@@ -857,7 +933,8 @@ def generate_function_doc(category, function_support_list, output):
                     f = '&#124;'
                 elif f == '||':
                     f = '&#124;&#124;'
-                data.append([f, classname, support, '' if f not in GLUTEN_RESTRICTIONS else GLUTEN_RESTRICTIONS[f]])
+                data.append([f, classname, support,
+                             '' if f not in GLUTEN_RESTRICTIONS[category] else GLUTEN_RESTRICTIONS[category][f]])
             table = tabulate.tabulate(data, headers, tablefmt="github")
             lines += table + '\n\n'
 
@@ -865,14 +942,25 @@ def generate_function_doc(category, function_support_list, output):
         fd.write(lines)
 
 
-def run_GlutenSQLQueryTestSuite():
+def run_test_suites(categories):
     log4j_properties_file = os.path.abspath(
         os.path.join(os.path.dirname(os.path.abspath(__file__)), 'log4j2.properties'))
+
+    suite_list = []
+    for category in categories:
+        if FUNCTION_SUITES[category]:
+            suite_list.append(','.join([FUNCTION_SUITE_PACKAGE + name for name in FUNCTION_SUITES[category]]))
+    suites = ','.join(suite_list)
+
+    if not suites:
+        logging.log(logging.WARNING, "No test suites to run.")
+        return
+
     command = [
         "mvn", "test",
         "-Pspark-3.5", "-Pspark-ut", "-Pbackends-velox",
         f"-DargLine=-Dspark.test.home={spark_home} -Dlog4j2.configurationFile=file:{log4j_properties_file}",
-        "-DwildcardSuites=org.apache.spark.sql.GlutenSQLQueryTestSuite",
+        f"-DwildcardSuites={suites}",
         "-Dtest=none",
         "-Dsurefire.failIfNoSpecifiedTests=false"
     ]
@@ -880,23 +968,41 @@ def run_GlutenSQLQueryTestSuite():
     subprocess.Popen(command, cwd=gluten_home).wait()
 
 
+def get_maven_project_version():
+    """Return the project version that Maven reports for the project rooted at gluten_home.
+
+    Raises RuntimeError, carrying Maven's stderr, when the command exits with a non-zero status.
+    """
+    maven_cmd = ['mvn', 'help:evaluate', '-Dexpression=project.version', '-q', '-DforceStdout']
+    # Maven is run from gluten_home and its stdout/stderr are captured as text.
+    completed = subprocess.run(maven_cmd, capture_output=True, text=True, cwd=gluten_home)
+    if completed.returncode != 0:
+        raise RuntimeError(f"Error running Maven command: {completed.stderr}")
+    # Surrounding whitespace is stripped from the captured version string.
+    return completed.stdout.strip()
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--spark_home", type=str, required=True,
                         help="Directory to spark source code for the newest supported spark version in Gluten. "
                              "It's required the spark project has been built from source.")
-    parser.add_argument("--skip_run_test_suite", action='store_true',
+    parser.add_argument("--skip_test_suite", action='store_true',
                         help="Whether to run test suite. Set to False to skip running the test suite.")
+    parser.add_argument("--categories", type=str, default=','.join(FUNCTION_CATEGORIES),
+                        help="Use comma-separated string to specify the function categories to generate the docs. "
+                             "Default is all categories.")
     args = parser.parse_args()
 
     spark_home = args.spark_home
     findspark.init(spark_home)
 
     gluten_home = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../'))
-    if not args.skip_run_test_suite:
-        run_GlutenSQLQueryTestSuite()
+    if not args.skip_test_suite:
+        run_test_suites(args.categories.split(','))
 
-    gluten_jar = os.path.join(gluten_home, 'package', 'target', 'gluten-package-1.4.0-SNAPSHOT.jar')
+    gluten_version = get_maven_project_version()
+    gluten_jar = os.path.join(gluten_home, 'package', 'target', f'gluten-package-{gluten_version}.jar')
     if not os.path.exists(gluten_jar):
         raise Exception(f"Gluten jar not found at {gluten_jar}")
 
@@ -910,10 +1016,7 @@ def run_GlutenSQLQueryTestSuite():
 
     # Generate the function list to the global variables.
     all_function_names = []
-    scalar_functions = []
-    agg_functions = []
-    window_functions = []
-    generator_functions = []
+    functions = {'scalar': set(), 'aggregate': set(), 'window': set(), 'generator': set()}
     classname_to_function = {}
     function_to_classname = {}
     group_functions = {}
@@ -921,8 +1024,9 @@ def run_GlutenSQLQueryTestSuite():
 
     spark_function_map = create_spark_function_map()
 
-    scalar_support_list, agg_support_list, window_support_list, generator_support_list, unresolved = parse_logs(
+    support_list, unresolved = parse_logs(
         os.path.join(gluten_home, 'gluten-ut', 'spark35', 'target', 'gen-function-support-docs-tests.log'))
 
-    generate_function_doc('scalar', scalar_support_list,
-                          os.path.join(gluten_home, 'docs', 'velox-backend-scalar-function-support.md'))
+    for category in args.categories.split(','):
+        generate_function_doc(category,
+                              os.path.join(gluten_home, 'docs', f'velox-backend-{category}-function-support.md'))

From 281d021f0ed33de66dbb8a339b88b15dbd87dbaa Mon Sep 17 00:00:00 2001
From: Rong Ma <rongma1997@gmail.com>
Date: Tue, 11 Mar 2025 19:07:49 +0000
Subject: [PATCH 2/3] Update window support

---
 docs/velox-backend-window-function-support.md | 18 ++++++++++++++++++
 tools/scripts/gen-function-support-docs.py    |  3 ++-
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 docs/velox-backend-window-function-support.md

diff --git a/docs/velox-backend-window-function-support.md b/docs/velox-backend-window-function-support.md
new file mode 100644
index 000000000000..85e6a7cbdfda
--- /dev/null
+++ b/docs/velox-backend-window-function-support.md
@@ -0,0 +1,18 @@
+# Window Functions Support Status
+
+**Out of 9 window functions in Spark 3.5, Gluten currently fully supports 9 functions.**
+
+## Window Functions
+
+| Spark Functions   | Spark Expressions   | Status   | Restrictions   |
+|-------------------|---------------------|----------|----------------|
+| cume_dist         | CumeDist            | S        |                |
+| dense_rank        | DenseRank           | S        |                |
+| lag               | Lag                 | S        |                |
+| lead              | Lead                | S        |                |
+| nth_value         | NthValue            | S        |                |
+| ntile             | NTile               | S        |                |
+| percent_rank      | PercentRank         | S        |                |
+| rank              | Rank                | S        |                |
+| row_number        | RowNumber           | S        |                |
+
diff --git a/tools/scripts/gen-function-support-docs.py b/tools/scripts/gen-function-support-docs.py
index 87affacc1c97..929a03a8efe5 100644
--- a/tools/scripts/gen-function-support-docs.py
+++ b/tools/scripts/gen-function-support-docs.py
@@ -570,7 +570,8 @@
     'aggregate': {'GlutenSQLQueryTestSuite', 'GlutenApproxCountDistinctForIntervalsQuerySuite',
                   'GlutenBitmapExpressionsQuerySuite',
                   'GlutenDataFrameAggregateSuite'},
-    'window': {'GlutenSQLQueryTestSuite'},
+    # All window functions are supported, so no test suite needs to run for this category.
+    'window': {},
     'generator': {'GlutenGeneratorFunctionSuite'}
 }
 

From 8424096f34aae77ef4944dc68aa93e5b905f9a48 Mon Sep 17 00:00:00 2001
From: Rong Ma <rongma1997@gmail.com>
Date: Tue, 11 Mar 2025 19:11:56 +0000
Subject: [PATCH 3/3] update doc

---
 ...elox-backend-aggregate-function-support.md |  2 +-
 docs/velox-backend-support-progress.md        | 76 ++-----------------
 tools/scripts/gen-function-support-docs.py    |  6 +-
 3 files changed, 10 insertions(+), 74 deletions(-)

diff --git a/docs/velox-backend-aggregate-function-support.md b/docs/velox-backend-aggregate-function-support.md
index 910a2573a914..ee6fa131cddf 100644
--- a/docs/velox-backend-aggregate-function-support.md
+++ b/docs/velox-backend-aggregate-function-support.md
@@ -1,6 +1,6 @@
 # Aggregate Functions Support Status
 
-**Out of 62 aggregate functions in Spark 3.5, Gluten currently fully supports 54 functions and partially supports 1 functions.**
+**Out of 62 aggregate functions in Spark 3.5, Gluten currently fully supports 54 functions and partially supports 1 function.**
 
 ## Aggregate Functions
 
diff --git a/docs/velox-backend-support-progress.md b/docs/velox-backend-support-progress.md
index ab2ca76dd8e8..38159b74ffe9 100644
--- a/docs/velox-backend-support-progress.md
+++ b/docs/velox-backend-support-progress.md
@@ -103,74 +103,8 @@ Please check the links below for the detailed support status of each category:
 
 [Scalar Functions Support Status](./velox-backend-scalar-function-support.md)
 
-### Other Functions Support Status (To be updated)
-
-| Spark Functions       | Velox/Presto Functions | Velox/Spark functions | Gluten | Restrictions | BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DATE | TIMESTAMP | STRING | DECIMAL | NULL | BINARY | CALENDAR | ARRAY | MAP | STRUCT | UDT |
-|-----------------------|------------------------|-----------------------|--------|--------------|---------|------|-------|-----|------|-------|--------|------|-----------|--------|---------|------|--------|----------|-------|-----|--------|-----|
-| bit_and               | bitwise_and_agg        |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| bit_or                |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| bit_xor               |                        | bit_xor               | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| explode               |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| explode_outer         |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| get_map_value         |                        | element_at            | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       | S   |        |     |
-| posexplode_outer      |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| any                   |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| approx_count_distinct | approx_distinct        |                       | S      |              | S       | S    | S     | S   | S    | S     | S      | S    |           | S      |         |      |        |          |       |     |        |     |
-| approx_percentile     |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| avg                   | avg                    |                       | S      | ANSI OFF     |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| bool_and              |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| bool_or               |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| collect_list          |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| collect_set           |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| corr                  | corr                   |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| count                 | count                  |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| count_if              | count_if               |                       |        |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| count_min_sketch      |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| covar_pop             | covar_pop              |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| covar_samp            | covar_samp             |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| every                 |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| first                 |                        | first                 | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| first_value           |                        | first_value           | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| grouping              |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| grouping_id           |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| kurtosis              | kurtosis               | kurtosis              | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| last                  |                        | last                  | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| last_value            |                        | last_value            | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| max                   | max                    |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| max_by                |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| mean                  | avg                    |                       | S      | ANSI OFF     |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| min                   | min                    |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| min_by                |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| regr_avgx             | regr_avgx              | regr_avgx             | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_avgy             | regr_avgy              | regr_avgy             | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_count            | regr_count             | regr_count            | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_r2               | regr_r2                | regr_r2               | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_intercept        | regr_intercept         | regr_intercept        | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_slope            | regr_slope             | regr_slope            | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_sxy              | regr_sxy               | regr_sxy              | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_sxx              | regr_sxx               | regr_sxx              | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| regr_syy              | regr_syy               | regr_syy              | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| skewness              | skewness               | skewness              | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| some                  |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| std                   | stddev                 |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| stddev                | stddev                 |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| stddev_pop            | stddev_pop             |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| stddev_samp           | stddev_samp            |                       | S      |              |         |      | S     | S   | S    | S     | S      |      |           |        |         |      |        |          |       |     |        |     |
-| sum                   | sum                    |                       | S      | ANSI OFF     |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| var_pop               | var_pop                |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| var_samp              | var_samp               |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| variance              | variance               |                       | S      |              |         | S    | S     | S   | S    | S     |        |      |           |        |         |      |        |          |       |     |        |     |
-| cume_dist             | cume_dist              |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| dense_rank            | dense_rank             |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| lag                   |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| lead                  |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| nth_value             | nth_value              | nth_value             | PS     |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| ntile                 | ntile                  | ntile                 | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| percent_rank          | percent_rank           |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| rank                  | rank                   |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| row_number            | row_number             |                       | S      |              |         |      | S     | S   | S    |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| inline                |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| inline_outer          |                        |                       |        |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| raise_error           |                        | raise_error           | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
-| stack                 |                        |                       | S      |              | S       | S    | S     | S   | S    | S     | S      | S    | S         | S      | S       | S    | S      | S        | S     | S   | S      | S   |
-| try_substract         |                        |                       | S      |              |         |      |       |     |      |       |        |      |           |        |         |      |        |          |       |     |        |     |
\ No newline at end of file
+[Aggregate Functions Support Status](./velox-backend-aggregate-function-support.md)
+
+[Window Functions Support Status](./velox-backend-window-function-support.md)
+
+[Generator Functions Support Status](./velox-backend-generator-function-support.md)
diff --git a/tools/scripts/gen-function-support-docs.py b/tools/scripts/gen-function-support-docs.py
index 929a03a8efe5..d46edfcc3b98 100644
--- a/tools/scripts/gen-function-support-docs.py
+++ b/tools/scripts/gen-function-support-docs.py
@@ -887,6 +887,8 @@ def function_not_found(r):
 
 
 def generate_function_doc(category, output):
+    def support_str(num_functions):  # count plus noun; only exactly 1 is singular, so 0 reads "0 functions"
+        return f"{num_functions} function" if num_functions == 1 else f"{num_functions} functions"
     num_unsupported = len(list(filter(lambda x: x[0] is not None, support_list[category]['unsupported'])))
     num_unsupported_expression = len(support_list[category]['unsupported_expr'])
     num_unknown_function = len(support_list[category]['unknown'])
@@ -905,10 +907,10 @@ def generate_function_doc(category, output):
 
     headers = ['Spark Functions', 'Spark Expressions', 'Status', 'Restrictions']
 
-    partially_supports = '.' if not num_partially_supported else f' and partially supports {num_partially_supported} functions.'
+    partially_supports = '.' if not num_partially_supported else f' and partially supports {support_str(num_partially_supported)}.'
     lines = f'''# {category.capitalize()} Functions Support Status
 
-**Out of {len(functions[category])} {category} functions in Spark 3.5, Gluten currently fully supports {num_supported} functions{partially_supports}**
+**Out of {len(functions[category])} {category} functions in Spark 3.5, Gluten currently fully supports {support_str(num_supported)}{partially_supports}**
 
 '''