Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions mathics/builtin/atomic/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,7 +881,7 @@ class ToString(Builtin):
"""

options = {
"CharacterEncoding": '"Unicode"',
"CharacterEncoding": "$CharacterEncoding",
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For ToString, the default CharacterEncoding should be $CharacterEncoding

"FormatType": "OutputForm",
"NumberMarks": "$NumberMarks",
"PageHeight": "Infinity",
Expand All @@ -898,8 +898,21 @@ def eval_default(self, value, evaluation: Evaluation, options: dict):

def eval_form(self, expr, form, evaluation: Evaluation, options: dict):
"ToString[expr_, form_Symbol, OptionsPattern[ToString]]"
encoding = options["System`CharacterEncoding"]
return eval_ToString(expr, form, encoding.value, evaluation)
encoding = options["System`CharacterEncoding"].evaluate(evaluation)
if isinstance(encoding, String):
encoding_str = encoding.value
if encoding_str not in _encodings:
evaluation.message("$CharacterEncoding", "charcode", encoding)
encoding_str = evaluation.definitions.get_ownvalue(
"System`$SystemCharacterEncoding"
).value
else:
evaluation.message("$CharacterEncoding", "charcode", encoding)
encoding_str = evaluation.definitions.get_ownvalue(
"System`$SystemCharacterEncoding"
).value

return eval_ToString(expr, form, encoding_str, evaluation)


class Transliterate(Builtin):
Expand Down
1 change: 1 addition & 0 deletions mathics/core/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ def to_tex(self, **options) -> str:
return self.to_format("latex", **options)

def to_text(self, **options) -> str:
options.setdefault("encoding", "Unicode")
return self.to_format("text", **options)

# Deprecated
Expand Down
7 changes: 6 additions & 1 deletion mathics/core/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,10 +419,15 @@ def format_output(self, expr, format=None):
if result is None:
return None

try:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ensures that when expressions are formatted as text, the encoding is always applied. With this change, if we specify that the encoding is "ASCII", all the tests must match with ASCII outputs.

encoding = self.definitions.get_ownvalue("System`$CharacterEncoding").value
except AttributeError:
encoding = "Unicode"

try:
# With the new implementation, if result is not a ``BoxExpression``
# then we should raise a BoxError here.
boxes = result.to_text(evaluation=self)
boxes = result.to_text(evaluation=self, encoding=encoding)
except BoxError:
self.message(
"General", "notboxes", Expression(SymbolFullForm, result).evaluate(self)
Expand Down
20 changes: 15 additions & 5 deletions mathics/doc/doc_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple

from mathics.core.evaluation import Message, Print, _Out
from mathics.eval.encoding import encode_string_value

if TYPE_CHECKING:
from mathics.doc.structure import DocSection
Expand Down Expand Up @@ -406,14 +407,18 @@ def strip_sentinal(line: str):
def __str__(self) -> str:
return self.test

def compare(self, result: Optional[str], out: tuple = tuple()) -> bool:
def compare(
self, result: Optional[str], out: tuple = tuple(), encoding="ASCII"
) -> bool:
"""
Performs a doctest comparison between ``result`` and ``wanted`` and returns
True if the test should be considered a success.
"""
return self.compare_result(result) and self.compare_out(out)
return self.compare_result(result, encoding=encoding) and self.compare_out(
out, encoding=encoding
)

def compare_out(self, outs: tuple = tuple()) -> bool:
def compare_out(self, outs: tuple = tuple(), encoding="ASCII") -> bool:
"""Compare messages and warnings produced during the evaluation of
the test with the expected messages and warnings."""
# Check out
Expand All @@ -434,12 +439,14 @@ def tabs_to_spaces(val):
for got, wanted in zip(outs, wanted_outs):
if wanted.text == "...":
return True
if not tabs_to_spaces(got) == tabs_to_spaces(wanted):
if not tabs_to_spaces(got) == encode_string_value(
tabs_to_spaces(wanted), encoding=encoding
):
return False

return True

def compare_result(self, result: Optional[str]):
def compare_result(self, result: Optional[str], encoding="ASCII"):
"""Compare a result with the expected result"""
wanted = self.result
# Check result
Expand All @@ -458,6 +465,9 @@ def compare_result(self, result: Optional[str]):
return False

for res, want in zip(result_list, wanted_list):
# TODO_ Be more careful with special characters used in
# pattern matching.
want = encode_string_value(want, encoding=encoding)
wanted_re = re.escape(want.strip())
wanted_re = wanted_re.replace("\\.\\.\\.", ".*?")
wanted_re = f"^{wanted_re}$"
Expand Down
4 changes: 2 additions & 2 deletions mathics/doc/documentation/1-Manual.mdoc
Original file line number Diff line number Diff line change
Expand Up @@ -656,10 +656,10 @@ You can also specify a list of rules:
There is a "delayed" version of 'Rule' which can be specified by ':>' (similar to the relation of ':=' to '='):

>> a :> 1 + 2
= a :> 1 + 2
= a 1 + 2

>> a -> 1 + 2
= a -> 3
= a 3
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I put this change just as an example. In principle, changing all the docstrings that involve Unicode symbols would make the docpipeline work with any encoding. I didn't do that because I would like to have some feedback before facing this task.
Also, I think that another possibility would be to use named characters instead of Unicode in these expected lines. But for it, I would also need to adjust some code in the doctest parser. @rocky, thoughts?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mmatera I thought about this for several minutes, and right now I just don't feel confident in suggesting anything one way or another. It might be something to talk over and discuss. For example, it might be that we decide to try one thing on a small scale, see how it goes, and then try another.

Is there some way we can discuss in a manner other than PR review comments?


This is useful when the right side of a rule should not be evaluated immediately (before matching):
>> {1, 2} /. x_Integer -> N[x]
Expand Down
11 changes: 3 additions & 8 deletions mathics/docpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
# When 3.8 is base, the below can be a Literal type.
INVALID_TEST_GROUP_SETUP = (None, None)

CHARACTER_ENCODING = settings.SYSTEM_CHARACTER_ENCODING

TestParameters = namedtuple(
"TestParameters",
[
Expand Down Expand Up @@ -138,10 +140,6 @@ def validate_group_setup(
else:
self.output_data = {}

# For consistency set the character encoding ASCII which is
# the lowest common denominator available on all systems.
settings.SYSTEM_CHARACTER_ENCODING = "ASCII"

if self.session.definitions is None:
self.print_and_log("Definitions are not initialized.")
return INVALID_TEST_GROUP_SETUP
Expand Down Expand Up @@ -224,7 +222,7 @@ def test_case(
return False

time_start = datetime.now()
comparison_result = test.compare_result(result)
comparison_result = test.compare_result(result, encoding=CHARACTER_ENCODING)

if test_parameters.check_partial_elapsed_time:
test_pipeline.print_and_log(
Expand Down Expand Up @@ -498,10 +496,7 @@ def test_tests(
"""
test_status: TestStatus = test_pipeline.status
test_parameters: TestParameters = test_pipeline.parameters
# For consistency set the character encoding ASCII which is
# the lowest common denominator available on all systems.

settings.SYSTEM_CHARACTER_ENCODING = "ASCII"
test_pipeline.reset_user_definitions()

output_data, names = test_pipeline.validate_group_setup(
Expand Down
72 changes: 72 additions & 0 deletions mathics/eval/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Functions to format strings in a given encoding.
"""

from typing import Dict

from mathics.core.convert.op import operator_to_ascii, operator_to_unicode

# Map WMA encoding names to Python codec names.
# Each Windows code page has its own codec: cp1252 is the Western
# European ("ANSI") page, while the Greek page is cp1253.
ENCODING_WMA_TO_PYTHON = {
    "WindowsEastEurope": "cp1250",
    "WindowsCyrillic": "cp1251",
    "WindowsANSI": "cp1252",
    "WindowsGreek": "cp1253",
    "WindowsTurkish": "cp1254",
}

UNICODE_CHARACTER_TO_ASCII = {
ch: operator_to_ascii.get(name, rf"\[{name}]")
for name, ch in operator_to_unicode.items()
}

# These characters are used in encoding
# in WMA, and differs from what we have
# in Mathics3-scanner tables:
UNICODE_CHARACTER_TO_ASCII.update(
{
operator_to_unicode["Times"]: r" x ",
"": r"\[DifferentialD]",
Copy link
Copy Markdown
Member

@rocky rocky Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

 or F74C is the WL encoding of DifferentialD mentioned in named-characters.yml

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, all the letter-like characters are missing in the current approach. However, I guess this should be handled by adding a table that takes this into account.

}
)


class EncodingNameError(Exception):
    """Raised when no translation table is available for an encoding name."""


def get_encoding_table(encoding: str) -> Dict[str, str]:
    """
    Return a dictionary mapping characters of the internal (Unicode)
    representation to their replacement text in the requested encoding.

    An empty dictionary means that no character needs to be translated.

    Raises ``EncodingNameError``, carrying the offending name, when no
    table is available for ``encoding``.
    """
    # "Unicode" and "UTF-8" keep every internal character unchanged.
    if encoding in ("Unicode", "UTF-8"):
        return {}

    # In the final implementation, this should load the corresponding
    # json table or an encoding file as in WMA
    # SystemFiles/CharacterEncodings/*.m
    if encoding == "ASCII":
        return UNICODE_CHARACTER_TO_ASCII

    # Include the encoding name so that the failure can be diagnosed.
    raise EncodingNameError(encoding)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of raising this exception, we could try to create an evaluation object and load the encodings from .m files in some special folder, like in WMA. I plan to do that in another round, which would be relatively easy.



def encode_string_value(value: str, encoding: str) -> str:
    """Convert the Unicode string ``value`` to the required ``encoding``.

    Every character of ``value`` that has an entry in the table returned by
    ``get_encoding_table(encoding)`` is replaced by that entry, which may be
    longer than one character; all other characters are kept unchanged.
    Any exception raised by ``get_encoding_table`` is propagated.
    """

    # In WMA, encodings are read from SystemFiles/CharacterEncodings/*.m
    # on the fly. We should load them from Mathics3-Scanner tables.
    encoding_table = get_encoding_table(encoding)
    if not encoding_table:
        # Nothing to translate: return the input as is.
        return value
    # Build the result in a single pass instead of repeated concatenation.
    return "".join(encoding_table.get(ch, ch) for ch in value)
14 changes: 11 additions & 3 deletions mathics/eval/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,23 @@
from mathics.core.expression_predefined import MATHICS3_INFINITY
from mathics.core.list import ListExpression
from mathics.core.symbols import Symbol, SymbolTrue
from mathics.eval.encoding import EncodingNameError
from mathics.format.box import format_element


def eval_ToString(
expr: BaseElement, form: Symbol, encoding: String, evaluation: Evaluation
) -> String:
boxes = format_element(expr, evaluation, form, encoding=encoding)
text = boxes.to_text(evaluation=evaluation)
return String(text)

boxes = format_element(expr, evaluation, form)
Copy link
Copy Markdown
Member

@rocky rocky Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the final idea is that the strings in format_element are going to get converted, then I think this is approaching this the wrong way.

Instead, format_element needs to take the parameters expr, form, and encoding to produce boxes that have the appropriate strings in them initially.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but this doesn't align with how the experiments I showed you suggest WMA works. It does not matter how you create a string or a Box expression; in the end, an encoding pass is applied. And if you do the conversion earlier, a double conversion spoils the result.
Handling encoding at the level of format_element is like to modify the underlying structure of a Graphics object, because you know in the end it is going to be converted into a PNG file.

Copy link
Copy Markdown
Member

@rocky rocky Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, but this doesn't align with how the experiments I showed you suggest WMA works.

I did not find anywhere in those experiments that there was a string that was encoded one way, and inside ToString, it got reencoded, as opposed to being encoded correctly initially.

It does not matter how you create a string or a Box expression; in the end, an encoding pass is applied.

That is not at issue here. What is at issue here is taking a string that was wrongly encoded and re-encoding it.

Consider this example where I set a breakpoint at the location we are discussing:

$ mathics3
...
In[1]:= ToString[a >= b, CharacterEncoding -> "ASCII"]
(/tmp/Mathics3/mathics-core/mathics/eval/strings.py:30:5 @46): eval_ToString
-- 30     try:
(trepan3k) list
 25    	    expr: BaseElement, form: Symbol, encoding: String, evaluation: Evaluation
 26    	) -> String:
 27    	
 28    	    boxes = format_element(expr, evaluation, form)
 29    	    breakpoint()
 30  ->	    try:
 31    	        return String(boxes.to_text(evaluation=evaluation, encoding=encoding))
 32    	    except EncodingNameError:
 33    	        # Mimic the WMA behavior. In the future, we can implement the mechanism
 34    	        # with encodings stored in .m files, and give a chance with it.
(trepan3k) boxes.elements
(<Expression: <Symbol: System`PaneBox>[<String: ""a ≥ b"">]>, <Expression: <Symbol: ...

<String: ""a ≥ b""> is wrong. That should be <String: ""a >= b"">.

And if you do the conversion earlier, a double conversion spoils the result. Handling encoding at the level of format_element is like to modify the underlying structure of a Graphics object, because you know in the end it is going to be converted into a PNG file.

This is not relevant here. We started with a Mathics3 Expression, and inside format_element, this expression got turned into an incorrect string, because encoding information indicating that strings are supposed to be ASCII was not respected inside format_element.

Another viable solution might be to have format_element not convert the expression a >= b to a String, and leave it as an Expression for later. But, I am not sure that is possible or correct. I believe only that what is done is incorrect and there's no evidence right now that WMA is reencoding strings instead of encoding them correctly initially.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<String: ""a ≥ b""> is wrong. That should be <String: ""a >= b"">.

I have been looking at this again, and, again, this is a central misunderstanding: as I see it, line 28

    	    boxes = format_element(expr, evaluation, form)

must return a boxed expression that uses the internal representation (Unicode/UTF-8). Then, the result <String: ""a ≥ b""> is correct. The encoding is applied in line 31

    	        return String(boxes.to_text(evaluation=evaluation, encoding=encoding))

which takes the box expression and converts it into a Python string, in the request encoding.

The advantage of this approach is that all the codepage translation machinery is completely localized in one module. The drawback is that we have to scan each character to see if we need to translate it. But this is how WMA does it, and I guess their developers had very good reasons to do it this way.

try:
return String(boxes.to_text(evaluation=evaluation, encoding=encoding))
except EncodingNameError:
# Mimic the WMA behavior. In the future, we can implement the mechanism
# with encodings stored in .m files, and give a chance with it.
evaluation.message("Get", "noopen", String("encodings/" + encoding + "." + "m"))

return String(boxes.to_text(evaluation=evaluation, encoding="Unicode"))


def eval_StringContainsQ(name, string, patt, evaluation, options, matched):
Expand Down
3 changes: 2 additions & 1 deletion mathics/format/box/numberform.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
Real,
String,
)
from mathics.core.convert.op import operator_to_unicode
from mathics.core.element import BaseElement, BoxElementMixin
from mathics.core.evaluation import Evaluation
from mathics.core.expression import Expression
Expand Down Expand Up @@ -49,7 +50,7 @@ def default_numberformat_outputform(man, base, exp, opts):
"ExponentFunction": lambda x: (SymbolNull if abs(x.value) <= 5 else x),
"ExponentStep": 1,
"NumberFormat": default_numberformat_outputform,
"NumberMultiplier": "×",
"NumberMultiplier": operator_to_unicode["Times"],
"NumberPadding": ["", "0"],
"NumberPoint": ".",
"NumberSeparator": [",", ""],
Expand Down
6 changes: 0 additions & 6 deletions mathics/format/form/inputform.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
SymbolRight,
)
from mathics.format.box.formatvalues import do_format # , format_element
from mathics.settings import SYSTEM_CHARACTER_ENCODING

from .util import (
ARITHMETIC_OPERATOR_STRINGS,
Expand Down Expand Up @@ -154,7 +153,6 @@ def _infix_expression_to_inputform_text(
# has a head that matches with a symbol associated to an infix
# operator, WMA builds its inputform without passing through
# its "Infix" form.
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, ops_lst, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
Expand Down Expand Up @@ -198,15 +196,13 @@ def _prefix_expression_to_inputform_text(
"""
Convert Prefix[...] into a OutputForm string.
"""
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, op_head, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
# Prefix works with just one operand:
if len(operands) != 1:
raise _WrongFormattedExpression
operand = operands[0]
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
target_txt = render_input_form(operand, evaluation, **kwargs)
parenthesized = group in (None, SymbolRight, SymbolNonAssociative)
target_txt = parenthesize(precedence, operand, target_txt, parenthesized)
Expand All @@ -220,7 +216,6 @@ def _postfix_expression_to_inputform_text(
"""
Convert Postfix[...] into a OutputForm string.
"""
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, op_head, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
Expand Down Expand Up @@ -295,7 +290,6 @@ def _rule_to_inputform_text(expr, evaluation: Evaluation, **kwargs) -> str:
"""Rule|RuleDelayed[{...}]"""
head = expr.head
elements = expr.elements
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
if len(elements) != 2:
return _generic_to_inputform_text(expr, evaluation, **kwargs)
pat, rule = (render_input_form(elem, evaluation, **kwargs) for elem in elements)
Expand Down
10 changes: 2 additions & 8 deletions mathics/format/form/outputform.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
get_numberform_parameters,
numberform_to_boxes,
)
from mathics.settings import SYSTEM_CHARACTER_ENCODING

from .inputform import render_input_form
from .util import (
Expand Down Expand Up @@ -338,7 +337,7 @@ def other_forms(expr, evaluation, **kwargs):
raise _WrongFormattedExpression

result = format_element(expr, evaluation, SymbolStandardForm, **kwargs)
return result.to_text()
return result.to_text(evaluation=evaluation, **kwargs)


@register_outputform("System`Integer")
Expand All @@ -357,7 +356,7 @@ def integer_outputform(n, evaluation, **kwargs):
result = numberform_to_boxes(n, digits, padding, evaluation, py_options)
if isinstance(result, String):
return result.value
return result.to_text()
return result.to_text(**kwargs)


@register_outputform("System`Image")
Expand All @@ -381,7 +380,6 @@ def _infix_outputform_text(expr: Expression, evaluation: Evaluation, **kwargs) -
# has a head that matches with a symbol associated to an infix
# operator, WMA builds its inputform without passing through
# its "Infix" form.
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, ops_lst, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
Expand Down Expand Up @@ -681,7 +679,6 @@ def _prefix_output_text(expr: Expression, evaluation: Evaluation, **kwargs) -> s
if not isinstance(expr.head, Symbol):
raise _WrongFormattedExpression

kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, op_head, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
Expand All @@ -691,7 +688,6 @@ def _prefix_output_text(expr: Expression, evaluation: Evaluation, **kwargs) -> s
if not isinstance(op_head, str):
raise _WrongFormattedExpression
operand = operands[0]
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
target_txt = render_output_form(operand, evaluation, **kwargs)
parenthesized = group in (None, SymbolRight, SymbolNonAssociative)
target_txt = parenthesize(precedence, operand, target_txt, parenthesized)
Expand All @@ -706,7 +702,6 @@ def _postfix_output_text(expr: Expression, evaluation: Evaluation, **kwargs) ->
if not isinstance(expr.head, Symbol):
raise _WrongFormattedExpression

kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
operands, op_head, precedence, group = collect_in_pre_post_arguments(
expr, evaluation, **kwargs
)
Expand Down Expand Up @@ -776,7 +771,6 @@ def rule_to_outputform_text(expr, evaluation: Evaluation, **kwargs):
raise _WrongFormattedExpression

elements = expr.elements
kwargs["encoding"] = kwargs.get("encoding", SYSTEM_CHARACTER_ENCODING)
if len(elements) != 2:
return _default_render_output_form(expr, evaluation, **kwargs)
pat, rule = (render_output_form(elem, evaluation, **kwargs) for elem in elements)
Expand Down
Loading
Loading