Search code examples
javascriptnode.jsregexunicodeemoji

Is there a way to check if a string in JS is one single emoji?


The question is simple: I have a string str, how do I check if str is one single emoji, and nothing else? Additionally I would prefer not using another library.

Match "🍎", "⛹🏿‍♂️", "3️⃣" but not "🍓a", "𝕒", "🍌🍀"

I'm having trouble finding a solution but here are some things I've tried so far:


Attempted Solution 1 - Play around lengths and ... operator

I learned that emojis occupy more than one byte, some even occupy 4 bytes, or more... and we can measure that via the string's length property:

console.log("🍎".length); // 2
console.log("🛡️".length); // 3
console.log("⛹🏿‍♂️".length); // 6

Then I found out that the ... operator takes this into account and correctly separates emojis in the array - I could then see the resulting array's length property and detect if they were different.

str = "⛹🏿‍♂️";
if (str.length !== [...str].length) {
  // is emoji?
} else {
  // is not emoji
}

But this doesn't check for other multi-byte characters such as 𝕡 whose length is 2. Plus some emojis were still getting separated in a weird.


Attempted Solution 2 - Regex, regular expressions

Of course regex would be a thing to look into but I've yet to find a viable solution.

This answer's regex \u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff] works perfectly fine to detect if a string has any emojis, but applied to my situation it produces a lot of problems. Here are my tests:

Part A - Without start/end of string regex (^ and $)

  • 2A.1 str.match(regex) is very inconsistent, it breaks down some emojis and some other unusable. I don't see a way to find out if it even contains non-emoji characters or if it contains more than one emoji:
let regex = /(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])/;

console.log("5️⃣".match(regex)); // [ '⃣', '⃣', index: 2, input: '5️⃣' ]
console.log("💡".match(regex)); // [ '💡', '💡', index: 0, input: '💡' ]
console.log("🌡️🌡️".match(regex)); // [ '🌡', '🌡', index: 0, input: '🌡️🌡️' ]
console.log("a⛅".match(regex)); // [ '⛅', '⛅', index: 1, input: 'a⛅' ]
  • 2A.2 regex.test(str) returns true whenever an emoji is included in the string, which isn't the behaviour I'm looking for:
let regex = /(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])/;

console.log(regex.test("5️⃣")); // true - correct
console.log(regex.test("a")); // false - correct
console.log(regex.test("🌡️🌡️")); // true - should be false
console.log(regex.test("hello ⛅!")); // true - should be false

Part B - With start/end of string regex (^ and $)

  • 2B.1 str.match(regex) returns null on certain emojis for some reason. I have no clue why but I'm assuming it has some relation as to why str.match(regex) would break down these emojis in Part A:
let regex = /^(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])$/;

console.log("5️⃣".match(regex)); // null
console.log("💡".match(regex)); // [ '💡', '💡', index: 0, input: '💡' ]
console.log("🌡️".match(regex)); // null
console.log("⛅".match(regex)); // [ '⛅', '⛅', index: 1, input: 'a⛅' ]
console.log("🍌🍀".match(regex)); // null
  • 2B.2 regex.test(str) will return false on the same emojis where it would return null on str.match(regex):
let regex = /^(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])$/;

console.log(regex.test("5️⃣")); // false - should be true
console.log(regex.test("💡")); // true - correct
console.log(regex.test("🌡️")); // false - should be true
console.log(regex.test("⛅")); // true - correct
console.log(regex.test("🍌🍀")); // false - correct

Part C - Other regular expressions

  • I found this one but it gives similar inconsistencies, although not the same /(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff]|[\u0023-\u0039]\ufe0f?\u20e3|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|[\ud83c[\ude01\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|[\ud83c[\ude32\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|[\ud83c[\ude50\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|[\u25aa-\u25ab]|\u25b6|\u25c0|[\u25fb-\u25fe]|\u00a9|\u00ae|\u2122|\u2139|\ud83c\udc04|[\u2600-\u26FF]|\u2b05|\u2b06|\u2b07|\u2b1b|\u2b1c|\u2b50|\u2b55|\u231a|\u231b|\u2328|\u23cf|[\u23e9-\u23f3]|[\u23f8-\u23fa]|\ud83c\udccf|\u2934|\u2935|[\u2190-\u21ff])/g:
let regex = /^(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff]|[\u0023-\u0039]\ufe0f?\u20e3|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|[\ud83c[\ude01\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|[\ud83c[\ude32\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|[\ud83c[\ude50\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|[\u25aa-\u25ab]|\u25b6|\u25c0|[\u25fb-\u25fe]|\u00a9|\u00ae|\u2122|\u2139|\ud83c\udc04|[\u2600-\u26FF]|\u2b05|\u2b06|\u2b07|\u2b1b|\u2b1c|\u2b50|\u2b55|\u231a|\u231b|\u2328|\u23cf|[\u23e9-\u23f3]|[\u23f8-\u23fa]|\ud83c\udccf|\u2934|\u2935|[\u2190-\u21ff])$/g

console.log(regex.test("5️⃣")); // true - correct
console.log(regex.test("💡")); // false - should be true
console.log(regex.test("🌡️")); // false - should be true
console.log(regex.test("⛅")); // true - correct
console.log(regex.test("🍌🍀")); // false - correct
  • Also this breaks horribly (second test changes based on first test?)
let regex = /^(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff]|[\u0023-\u0039]\ufe0f?\u20e3|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|[\ud83c[\ude01\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|[\ud83c[\ude32\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|[\ud83c[\ude50\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|[\u25aa-\u25ab]|\u25b6|\u25c0|[\u25fb-\u25fe]|\u00a9|\u00ae|\u2122|\u2139|\ud83c\udc04|[\u2600-\u26FF]|\u2b05|\u2b06|\u2b07|\u2b1b|\u2b1c|\u2b50|\u2b55|\u231a|\u231b|\u2328|\u23cf|[\u23e9-\u23f3]|[\u23f8-\u23fa]|\ud83c\udccf|\u2934|\u2935|[\u2190-\u21ff])$/g

console.log(regex.test("⛹🏿‍♂️")); // false
console.log(regex.test("⛅")); // true
let regex = /^(?:[\u2700-\u27bf]|(?:\ud83c[\udde6-\uddff]){2}|[\ud800-\udbff][\udc00-\udfff]|[\u0023-\u0039]\ufe0f?\u20e3|\u3299|\u3297|\u303d|\u3030|\u24c2|\ud83c[\udd70-\udd71]|\ud83c[\udd7e-\udd7f]|\ud83c\udd8e|\ud83c[\udd91-\udd9a]|\ud83c[\udde6-\uddff]|[\ud83c[\ude01\uddff]|\ud83c[\ude01-\ude02]|\ud83c\ude1a|\ud83c\ude2f|[\ud83c[\ude32\ude02]|\ud83c\ude1a|\ud83c\ude2f|\ud83c[\ude32-\ude3a]|[\ud83c[\ude50\ude3a]|\ud83c[\ude50-\ude51]|\u203c|\u2049|[\u25aa-\u25ab]|\u25b6|\u25c0|[\u25fb-\u25fe]|\u00a9|\u00ae|\u2122|\u2139|\ud83c\udc04|[\u2600-\u26FF]|\u2b05|\u2b06|\u2b07|\u2b1b|\u2b1c|\u2b50|\u2b55|\u231a|\u231b|\u2328|\u23cf|[\u23e9-\u23f3]|[\u23f8-\u23fa]|\ud83c\udccf|\u2934|\u2935|[\u2190-\u21ff])$/g;

console.log(regex.test("⛹")); // true
console.log(regex.test("⛅")); // false

Is there a way around all this emoji/unicode/regex mess? Are libraries/apis the only way? How do they do it?


Solution

  • Using this library: https://github.com/foliojs/grapheme-breaker try this:

    var str = "⛹🏿‍♂️";
    var Grapheme = require('grapheme-splitter');
    var splitter = new Grapheme();
    console.log(splitter.splitGraphemes(str).length);
    

    and the length should return 1.