Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ pipeline {
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-02-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-23-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
84 changes: 58 additions & 26 deletions nemo_text_processing/text_normalization/es/taggers/electronic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,14 +11,29 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_SPACE,
GraphFst,
at,
colon,
domain_string,
double_quotes,
double_slash,
http,
https,
protocol_string,
username_string,
www,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path, load_labels

common_domains = [x[0] for x in load_labels(get_abs_path("data/electronic/domain.tsv"))]
symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]


class ElectronicFst(GraphFst):
Expand All @@ -35,40 +50,57 @@ class ElectronicFst(GraphFst):
def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="classify", deterministic=deterministic)

dot = pynini.accep(".")
accepted_common_domains = pynini.union(*common_domains)
accepted_symbols = pynini.union(*symbols) - dot
accepted_characters = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols)
acceepted_characters_with_dot = pynini.closure(NEMO_ALPHA | NEMO_DIGIT | accepted_symbols | dot)
full_stop_accep = pynini.accep(".")
full_stop = "."

symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))]
symbols = pynini.union(*symbols)
symbols_no_full_stop = pynini.difference(symbols, full_stop_accep)
accepted_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols_no_full_stop), 1)
all_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols), 1)

# domains
domain = full_stop_accep + accepted_characters
domain_graph = (
pynutil.insert(domain_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters + pynini.closure(domain, 1))
+ pynutil.insert(double_quotes)
)

# email
username = (
pynutil.insert("username: \"")
+ acceepted_characters_with_dot
+ pynutil.insert("\"")
+ pynini.cross('@', ' ')
pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ all_characters
+ pynutil.insert(double_quotes)
+ pynini.cross(at, NEMO_SPACE)
)
domain_graph = accepted_characters + dot + accepted_characters
domain_graph = pynutil.insert("domain: \"") + domain_graph + pynutil.insert("\"")
domain_common_graph = (
pynutil.insert("domain: \"")
+ accepted_characters
+ accepted_common_domains
+ pynini.closure((accepted_symbols | dot) + pynini.closure(accepted_characters, 1), 0, 1)
+ pynutil.insert("\"")
email = username + domain_graph

# social media tags
tag = (
pynini.cross(at, "")
+ pynutil.insert(username_string + colon + NEMO_SPACE + double_quotes)
+ (accepted_characters | (accepted_characters + pynini.closure(domain, 1)))
+ pynutil.insert(double_quotes)
)
graph = (username + domain_graph) | domain_common_graph

# url
protocol_start = pynini.accep("https://") | pynini.accep("http://")
protocol_start = pynini.accep(https + colon + double_slash) | pynini.accep(http + colon + double_slash)
# protocol_end = pynini.accep("www.")
protocol_end = (
pynini.accep("www.")
pynini.accep(www + full_stop)
if deterministic
else pynini.accep("www.") | pynini.cross("www.", "doble ve doble ve doble ve.")
else pynini.accep(www + full_stop) | pynini.cross(www + full_stop, "doble ve doble ve doble ve.")
)
protocol = protocol_start | protocol_end | (protocol_start + protocol_end)
protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"")
graph |= protocol + insert_space + (domain_graph | domain_common_graph)
protocol = (
pynutil.insert(protocol_string + colon + NEMO_SPACE + double_quotes)
+ protocol
+ pynutil.insert(double_quotes)
)
url = protocol + pynutil.insert(NEMO_SPACE) + (domain_graph)

graph = url | domain_graph | email | tag
self.graph = graph

final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true"))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -69,7 +69,7 @@ def __init__(
os.makedirs(cache_dir, exist_ok=True)
whitelist_file = os.path.basename(whitelist) if whitelist else ""
far_file = os.path.join(
cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far"
cache_dir, f"_{input_case}_es_tn_{deterministic}_deterministic{whitelist_file}.far",
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
Expand All @@ -86,10 +86,10 @@ def __init__(
self.decimal = DecimalFst(cardinal=self.cardinal, deterministic=deterministic)
decimal_graph = self.decimal.fst

self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic)
self.fraction = FractionFst(cardinal=self.cardinal, ordinal=self.ordinal, deterministic=deterministic,)
fraction_graph = self.fraction.fst
self.measure = MeasureFst(
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic
cardinal=self.cardinal, decimal=self.decimal, fraction=self.fraction, deterministic=deterministic,
)
measure_graph = self.measure.fst
self.date = DateFst(cardinal=self.cardinal, deterministic=deterministic)
Expand All @@ -101,7 +101,7 @@ def __init__(
telephone_graph = self.telephone.fst
self.electronic = ElectronicFst(deterministic=deterministic)
electronic_graph = self.electronic.fst
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic)
self.money = MoneyFst(cardinal=self.cardinal, decimal=self.decimal, deterministic=deterministic,)
money_graph = self.money.fst
self.whitelist = WhiteListFst(input_case=input_case, deterministic=deterministic, input_file=whitelist)
whitelist_graph = self.whitelist.fst
Expand All @@ -118,7 +118,7 @@ def __init__(
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(money_graph, 1.09)
| pynutil.add_weight(telephone_graph, 1.11)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.11)
| pynutil.add_weight(word_graph, 200)
)
punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -17,9 +17,14 @@
from nemo_text_processing.text_normalization.en.graph_utils import (
NEMO_NOT_QUOTE,
NEMO_SIGMA,
NEMO_SPACE,
GraphFst,
colon,
delete_preserve_order,
insert_space,
domain_string,
double_quotes,
protocol_string,
username_string,
)
from nemo_text_processing.text_normalization.es.utils import get_abs_path

Expand All @@ -29,6 +34,7 @@
graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
arroba = "arroba"


class ElectronicFst(GraphFst):
Expand All @@ -45,34 +51,42 @@ class ElectronicFst(GraphFst):

def __init__(self, deterministic: bool = True):
super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

graph_digit_no_zero = (
digit_no_zero @ pynini.cdrewrite(pynini.cross("un", "uno"), "", "", NEMO_SIGMA).optimize()
)
graph_digit = graph_digit_no_zero | zero

def add_space_after_char():
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(" ") + insert_space) + (
NEMO_NOT_QUOTE - pynini.accep(" ")
return pynini.closure(NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE) + pynutil.insert(NEMO_SPACE)) + (
NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE)
)

verbalize_characters = pynini.cdrewrite(graph_symbols | graph_digit, "", "", NEMO_SIGMA)

user_name = pynutil.delete("username: \"") + add_space_after_char() + pynutil.delete("\"")
user_name = (
pynutil.delete(username_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char()
+ pynutil.delete(double_quotes)
)
user_name @= verbalize_characters

convert_defaults = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001) | domain_common | server_common
domain = convert_defaults + pynini.closure(insert_space + convert_defaults)
domain = convert_defaults + pynini.closure(pynutil.insert(NEMO_SPACE) + convert_defaults)
domain @= verbalize_characters

domain = pynutil.delete("domain: \"") + domain + pynutil.delete("\"")
domain = (
pynutil.delete(domain_string + colon + NEMO_SPACE + double_quotes) + domain + pynutil.delete(double_quotes)
)
protocol = (
pynutil.delete("protocol: \"")
pynutil.delete(protocol_string + colon + NEMO_SPACE + double_quotes)
+ add_space_after_char() @ pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
+ pynutil.delete("\"")
+ pynutil.delete(double_quotes)
)
self.graph = (pynini.closure(protocol + pynini.accep(" "), 0, 1) + domain) | (
user_name + pynini.accep(" ") + pynutil.insert("arroba ") + domain

self.graph = (pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain) | (
user_name + NEMO_SPACE + pynutil.insert(arroba + NEMO_SPACE) + domain
| (pynutil.insert(arroba + NEMO_SPACE) + user_name)
)

delete_tokens = self.delete_tokens(self.graph + delete_preserve_order)
self.fst = delete_tokens.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,11 @@ www.abc.es~w w w punto a b c punto e s
http://www.ourdailynews.com.sm~h t t p dos puntos barra barra w w w punto o u r d a i l y n e w s punto com punto s m
nvidia.com/1|2~nvidia punto com barra uno barra vertical dos
nvidia.com/2^~nvidia punto com barra dos acento circunflejo
www.unicef.org~w w w punto u n i c e f punto o r g
www.unicef.org~w w w punto u n i c e f punto o r g
brettspielversand.de.~b r e t t s p i e l v e r s a n d punto d e .
www.enveedya.net.~w w w punto e n v e e d y a punto net .
www.amazon.com.de.~w w w punto a m a z o n punto com punto d e .
https://www.abc.com~h t t p s dos puntos barra barra w w w punto a b c punto com
@jensen~arroba j e n s e n
@jensen.me~arroba j e n s e n punto m e
@wezyr1986~arroba w e z y r uno nueve ocho seis
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
+1 (123) 123-5678~más uno uno dos tres uno dos tres cinco seis siete ocho
1-800-go-u-haul~uno ochocientos go u haul
llámame al 123-123-5678~llámame al uno dos tres uno dos tres cinco seis siete ocho
123.123.0.40~doce tres punto uno dos tres punto cero punto cuatro cero~uno dos tres punto uno dos tres punto cero punto cuatro cero
la dirección de ip es 123.123.0.40~la dirección de i p es uno dos tres punto uno dos tres punto cero punto cuatro cero
0-800-hermano~cero ochocientos hermano
+58 (123) 123-5678 ext. 12~más cincuenta y ocho uno dos tres uno dos tres cinco seis siete ocho extensión uno dos
+58 (123) 123-5678-12~más cincuenta y ocho uno dos tres uno dos tres cinco seis siete ocho extensión uno dos
+54 911 1234-5678~más cincuenta y cuatro nueve uno uno uno dos tres cuatro cinco seis siete ocho
+54 911 1234-5678~más cincuenta y cuatro nueve uno uno uno dos tres cuatro cinco seis siete ocho
123.123.0.40~doce tres punto uno dos tres punto cero punto cuatro cero~uno dos tres punto uno dos tres punto cero punto cuatro cero
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ el dr.~el doctor
sr. rodriguez~señor rodriguez
182 esq. toledo~ciento ochenta y dos esquina toledo
da. ana pérez~doña ana pérez
ee. uu.~estados unidos
pza.~plaza
c.º b.~camino bajo
c.º b.~camino bajo
ee. uu.~estados unidos
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /bin/sh

GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"}
PROJECT_DIR=${2:-"/workspace/tests/en"}
PROJECT_DIR=${2:-"/workspace/tests/"}

runtest () {
input=$1
Expand Down