Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exclude punctuation from character count for Japanese texts #22050

Open
wants to merge 7 commits into
base: trunk
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ const expectedResults = {
textLength: {
isApplicable: true,
score: 9,
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3022 characters. Good job!",
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 2734 characters. Good job!",
},
externalLinks: {
isApplicable: true,
Expand Down Expand Up @@ -110,21 +110,21 @@ const expectedResults = {
},
subheadingsTooLong: {
isApplicable: true,
score: 3,
score: 6,
resultText: "<a href='https://yoa.st/34x' target='_blank'>Subheading distribution</a>: " +
"2 sections of your text are longer than the recommended number of characters (600) and are not separated by any subheadings. " +
"1 section of your text is longer than the recommended number of characters (600) and is not separated by any subheadings. " +
"<a href='https://yoa.st/34y' target='_blank'>Add subheadings to improve readability</a>.",
},
textParagraphTooLong: {
isApplicable: true,
score: 3,
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 4 of the paragraphs contain more " +
resultText: "<a href='https://yoa.st/35d' target='_blank'>Paragraph length</a>: 3 of the paragraphs contain more " +
"than the recommended maximum number of characters (300). <a href='https://yoa.st/35e' target='_blank'>Shorten your paragraphs</a>!",
},
textSentenceLength: {
isApplicable: true,
score: 3,
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 51.6% of the sentences contain more than 40 characters, " +
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 46.8% of the sentences contain more than 40 characters, " +
"which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
},
textTransitionWords: {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import textLength from "../../../../../src/languageProcessing/languages/ja/customResearches/textLength";
import Paper from "../../../../../src/values/Paper";

describe( "counts characters in a string", function() {
const paper = new Paper( "こんにちは。" );
describe( "counts character length of a Japanese text (punctuation and spaces are excluded)", function() {
const paper = new Paper( "「黒猫」(くろねこ、The Black Cat)は、1843年に 発表されたエドガー・アラン・ポーの短編小説。" );

it( "returns the number of characters for the text of a given paper", function() {
expect( textLength( paper ) ).toEqual( { text: "こんにちは。", count: 6, unit: "character" } );
expect( textLength( paper ) ).toEqual( { text: "「黒猫」(くろねこ、The Black Cat)は、1843年に 発表されたエドガー・アラン・ポーの短編小説。",
count: 43, unit: "character" } );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@ describe( "counts characters in a string", function() {
it( "returns the number of characters", function() {
expect( countCharactersFunction( "これに対し日本国有鉄道(国鉄)は、十河信二国鉄総裁と技師長の島秀雄の下、" +
"高速運転が可能な標準軌新線を建設することを決定。1959年(昭和34年)4月20日、新丹那トンネル熱海口で起工式を行って着工し、" +
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 124 );
} );
it( "returns the number of characters not including URL characters in the count", function() {
expect( countCharactersFunction( "www.yoast.comこれに対し日本国有鉄道(国鉄)は、十河信二国鉄総裁と技師長の島秀雄の下、" +
"高速運転が可能な標準軌新線を建設することを決定。1959年(昭和34年)4月20日、新丹那トンネル熱海口で起工式を行って着工し、" +
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 124 );
} );
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
"(представляващи краен брой знаци)." ) ).toBe( 78 );
"(представляващи краен брой знаци)." ) ).toBe( 75 );
} );
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "<!-- wp:embed {\"url\":\"https://www.youtube.com/watch?v=cbP2N1BQdYc\",\"type\":\"video\"," +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,56 +67,47 @@ describe( "counts words in sentences from text", function() {
expect( sentences[ 0 ].sentenceLength ).toBe( 4 );
expect( sentences[ 1 ].sentenceLength ).toBe( 2 );
} );
it( "returns sentences with question mark in Japanese", function() {
const mockPaper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
it( "returns Japanese sentences ending with different punctuation marks; character count doesn't include punctuation", function() {
const mockPaper = new Paper( "雨が降っている。いつ終わるの?わかった!さようなら" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with exclamation mark", function() {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was combined with the one above.

const mockPaper = new Paper( "雨が降っている. いつ終わるの!さようなら" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
expect( sentences[ 0 ].sentenceLength ).toBe( 7 );
expect( sentences[ 1 ].sentenceLength ).toBe( 6 );
expect( sentences[ 2 ].sentenceLength ).toBe( 4 );
expect( sentences[ 3 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with many spaces", function() {
const mockPaper = new Paper( "雨が降っている。 いつ終わるの? さようなら" );
it( "returns Japanese sentences with many spaces (2nd sentence with half-width, 1st and 3rd sentences with full-width spaces)", function() {
const mockPaper = new Paper( "雨 が 降っている。 いつ終わるの? さようなら " );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 8 );
expect( sentences[ 1 ].sentenceLength ).toBe( 7 );
expect( sentences[ 0 ].sentenceLength ).toBe( 7 );
expect( sentences[ 1 ].sentenceLength ).toBe( 6 );
expect( sentences[ 2 ].sentenceLength ).toBe( 5 );
} );
it( "returns sentences with html-tags, should count characters in Japanese", function() {
const mockPaper = new Paper( "いつ終わるの <img src='image.jpg' alt='自分を大事にして下さい' />" );
it( "returns Japanese sentence with an HTML img tag; character count excludes the HTML and the text within", function() {
const mockPaper = new Paper( "いつ終わるの <img src='http://domain.com/image.jpg' alt='自分を大事にして下さい' />?!" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 6 );
} );
it( "returns sentences with html-tags, should count characters in Japanese", function() {
const mockPaper = new Paper( "いつ終わるの <img src='http://domain.com/image.jpg' alt='自分を大事にして下さい' />. 春がやってきます。" );
it( "returns Japanese sentence with an embedded video; character count excludes the HTML", function() {
const mockPaper = new Paper( "いつ終わるの<iframe width=\"420\" height=\"315\" " +
"src=\"https://www.youtube.com/embed/tgbNymZ7vqY\"></iframe>。春がやってきます。" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const sentences = getSentences( mockPaper, mockResearcher );

expect( sentences[ 0 ].sentenceLength ).toBe( 7 );
expect( sentences[ 1 ].sentenceLength ).toBe( 9 );
expect( sentences[ 0 ].sentenceLength ).toBe( 6 );
expect( sentences[ 1 ].sentenceLength ).toBe( 8 );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ describe( "a test for getting paragraph length", function() {
} );

it( "returns the paragraph length of a paragraph in Japanese between p tags", function() {
const mockPaper = new Paper( "<p>これに対し日本国有鉄道</p>" );
const mockPaper = new Paper( "<p>これに対し日本国有鉄道</p>" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

Expand Down Expand Up @@ -54,7 +54,7 @@ describe( "a test for getting paragraph length", function() {

const paragraphLengths = getParagraphLength( mockPaper, mockResearcher );

expect( paragraphLengths[ 0 ].paragraphLength ).toBe( 36 );
expect( paragraphLengths[ 0 ].paragraphLength ).toBe( 33 );
} );

it( "returns the paragraph length of 2 paragraphs, both between p tags", function() {
Expand All @@ -75,7 +75,7 @@ describe( "a test for getting paragraph length", function() {
const paragraphLengths = getParagraphLength( mockPaper, mockResearcher );

expect( paragraphLengths[ 0 ].paragraphLength ).toBe( 18 );
expect( paragraphLengths[ 1 ].paragraphLength ).toBe( 18 );
expect( paragraphLengths[ 1 ].paragraphLength ).toBe( 15 );
} );

it( "returns the paragraph length of 2 paragraphs, both between p tags, divided by double linebreaks", function() {
Expand All @@ -97,7 +97,7 @@ describe( "a test for getting paragraph length", function() {
const paragraphLengths = getParagraphLength( mockPaper, mockResearcher );

expect( paragraphLengths[ 0 ].paragraphLength ).toBe( 18 );
expect( paragraphLengths[ 1 ].paragraphLength ).toBe( 18 );
expect( paragraphLengths[ 1 ].paragraphLength ).toBe( 15 );
} );

it( "returns the paragraph length, with empty paragraphs", function() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ describe( "gets the length of text segments expressed in characters " +
// Check the content and length of each individual text segment.
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].subheading ).toBe( "<h1>タイトル</h1>" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].text ).toBe( "文章です。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 5 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 4 );
} );

it( "returns an array with one text segment for a text without subheadings", function() {
Expand All @@ -109,10 +109,10 @@ describe( "gets the length of text segments expressed in characters " +
// Check the content and length of each individual text segment.
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].subheading ).toBe( "<h2>犬</h2>" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].text ).toBe( "犬はかわいいです。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 9 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 8 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].subheading ).toBe( "<h3>子犬</h3>" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].text ).toBe( "子犬が特にかわいいです。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].countLength ).toBe( 12 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].countLength ).toBe( 11 );
} );

it( "returns an array with 3 entries for a text with two subheadings, two text segments, and one introductory segment", function() {
Expand All @@ -122,12 +122,12 @@ describe( "gets the length of text segments expressed in characters " +
// Check the length of each individual text segment.
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].subheading ).toBe( "" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].text ).toBe( "トピックは犬です。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 9 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 0 ].countLength ).toBe( 8 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].subheading ).toBe( "<h2>犬</h2>" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].text ).toBe( "犬はかわいいです。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].countLength ).toBe( 9 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 1 ].countLength ).toBe( 8 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 2 ].subheading ).toBe( "<h3>子犬</h3>" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 2 ].text ).toBe( "子犬が特にかわいいです。" );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 2 ].countLength ).toBe( 12 );
expect( foundSubheadingsTextLength( mockPaper, japaneseResearcher )[ 2 ].countLength ).toBe( 11 );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ describe( "the keyphrase length research", function() {
} );

describe( "the keyphrase length research", function() {
it( "should count the words in the input and filters function words", function() {
it( "should count the words in the input and filter function words", function() {
const paper = new Paper( "", { keyword: "word word the word" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
Expand All @@ -42,7 +42,7 @@ Describe( "the keyphrase length research", function() {
} );
*/

describe( "the keyphrase length research for empty keyword", function() {
describe( "the keyphrase length research for an empty keyword", function() {
it( "should count the words in the input", function() {
const paper = new Paper( "", { keyword: "" } );
const researcher = new EnglishResearcher( paper );
Expand All @@ -53,3 +53,4 @@ describe( "the keyphrase length research for empty keyword", function() {
expect( result.keyphraseLength ).toBe( 0 );
} );
} );

Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import RussianResearcher from "../../../../src/languageProcessing/languages/ru/R
import ItalianResearcher from "../../../../src/languageProcessing/languages/it/Researcher";
import TurkishResearcher from "../../../../src/languageProcessing/languages/tr/Researcher";
import japaneseConfig from "../../../../src/languageProcessing/languages/ja/config/sentenceLength";
import JapaneseResearcher from "../../../../src/languageProcessing/languages/ja/Researcher";

const shortSentenceDefault = "Word ".repeat( 18 ) + "word. ";
const longSentenceDefault = "Word ".repeat( 20 ) + "word. ";
Expand Down Expand Up @@ -279,11 +280,15 @@ describe( "An assessment for sentence length", function() {
expect( assessment.hasMarks() ).toBe( true );
} );

/* it( "returns the score for 100% short sentences in Japanese", function() {
const mockPaper = new Paper( "" );
const assessment = new SentenceLengthInTextAssessment().getResult( mockPaper, Factory.buildMockResearcher( [
{ sentence: "", sentenceLength: 39 },
], false, false, japaneseConfig ) );
it( "returns the score for 100% short sentences in a language that should count sentence length in characters (Japanese)", function() {
const mockPaper = new Paper( "日本語では、完了の助動詞としては「つ」「ぬ」「たり」「り」が用いられた(「てけり」「にき」などの過去完了形も)。" );
const mockResearcher = new JapaneseResearcher( mockPaper );
buildTree( mockPaper, mockResearcher );

const assessment = new SentenceLengthInTextAssessment( {
slightlyTooMany: 25,
farTooMany: 30,
}, false, false ).getResult( mockPaper, mockResearcher );

expect( assessment.hasScore() ).toBe( true );
expect( assessment.getScore() ).toEqual( 9 );
Expand All @@ -304,7 +309,7 @@ describe( "An assessment for sentence length", function() {
expect( assessment.getScore() ).toEqual( 9 );
expect( assessment.getText() ).toEqual( "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: Great!" );
expect( assessment.hasMarks() ).toBe( true );
} );*/
} );

it( "is not applicable for empty papers", function() {
const mockPaper = new Paper();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ describe( "A text length assessment", function() {
} );

describe( "Tests regular post content type in Japanese. " +
"The score should use language-specific boundaries and should be based on character length.", function() {
const character = "";
"The score should use language-specific boundaries and should be based on character length (punctuation excluded).", function() {
const character = "(あ)。";
const textVeryFarBelowMinimum = character.repeat( 199 );
const textFarBelowMinimum = character.repeat( 399 );
const textBelowMinimum = character.repeat( 499 );
Expand Down
Loading
Loading