Search code examples
c++harfbuzz

How to get unicode codepoint in harfbuzz after calling hb_shape?


Before shaping, glyph_info[0].codepoint holds the Unicode codepoint; after calling hb_shape it holds a glyph index instead.

Isn't it supposed to give the Unicode codepoint again after I set the buffer content type to HB_BUFFER_CONTENT_TYPE_UNICODE? I still get the glyph index!

this is my code:

#include <iostream>
#include <harfbuzz/hb.h>

int main()
{
    hb_buffer_t *buf;
    buf = hb_buffer_create();
    hb_buffer_add_utf8(buf, "A", -1, 0, -1);

    hb_buffer_set_direction(buf, HB_DIRECTION_LTR);
    hb_buffer_set_script(buf, HB_SCRIPT_LATIN);
    hb_buffer_set_language(buf, hb_language_from_string("en", -1));

    hb_blob_t *blob = hb_blob_create_from_file("ARIALUNI.TTF");
    hb_face_t *face = hb_face_create(blob, 0);
    hb_font_t *font = hb_font_create(face);

    hb_buffer_content_type_t t1 = hb_buffer_get_content_type(buf);

    unsigned int glyph_count;
    hb_glyph_info_t *glyph_info = hb_buffer_get_glyph_infos(buf, &glyph_count);
    hb_glyph_position_t *glyph_pos = hb_buffer_get_glyph_positions(buf, &glyph_count);

    printf("Codepoint befor Shaping: %d\n", t1);
    printf("%x\n", glyph_info[0].codepoint);

    hb_shape(font, buf, NULL, 0);

    hb_buffer_content_type_t t2 = hb_buffer_get_content_type(buf);

    glyph_info = hb_buffer_get_glyph_infos(buf, &glyph_count);
    glyph_pos = hb_buffer_get_glyph_positions(buf, &glyph_count);

    printf("Codepoint after Shaping: %d\n", t2);
    printf("%x\n", glyph_info[0].codepoint);

    hb_buffer_set_content_type(buf, HB_BUFFER_CONTENT_TYPE_UNICODE);
    hb_buffer_content_type_t t3 = hb_buffer_get_content_type(buf);

    glyph_info = hb_buffer_get_glyph_infos(buf, &glyph_count);
    glyph_pos = hb_buffer_get_glyph_positions(buf, &glyph_count);

    printf("Codepoint after setting content type: %d\n", t3);
    printf("%x\n", glyph_info[0].codepoint);

    hb_buffer_destroy(buf);
    hb_font_destroy(font);
    hb_face_destroy(face);
    hb_blob_destroy(blob);
}

But this is my output:

Codepoint befor Shaping: 1
41
Codepoint after Shaping: 2
24
Codepoint after setting content type: 1
24

Why is the last code point still a glyph index and not a Unicode codepoint?

I expected a Unicode codepoint, but I get a glyph index.


Solution

  • You can use clusters to map each shaped glyph back to its source codepoint(s). Keep track of the codepoints/clusters passed to HarfBuzz, then compare the post-shaping glyph_info list against that saved list to see which codepoint(s) each glyph was shaped from:

    // NOTE: hb_buffer_get_glyph_infos returns buf->info; so we need to copy its contents before its modified from hb_shape (ty @user3061694 for pointing out)
    unsigned int glyph_count_before;
    hb_glyph_info_t* glyph_infos = hb_buffer_get_glyph_infos(buf, &glyph_count_before);
    unsigned int glyph_infos_before_len = sizeof(hb_glyph_info_t) * glyph_count_before;
    hb_glyph_info_t* glyph_infos_before = (hb_glyph_info_t*)malloc(glyph_infos_before_len );
    memcpy(glyph_infos_before, glyph_infos, len);
    
    hb_shape(font, buf, NULL, 0);
    
    unsigned int glyph_count_after;
    hb_glyph_info_t* glyph_infos_after = hb_buffer_get_glyph_infos(buf, &glyph_count_after);
    
    int source_idx = 0;
    for (int i = 0; i < glyph_count_after; ++i)
    {
        hb_glyph_info_t& glyph_info_after = glyph_infos_after[i];
        printf("Codepoints for glyph %d\n", i);
        
        // find the next grapheme cluster index
        int next_grapheme_cluster_idx = 0;
        for (next_grapheme_cluster_idx = i; next_grapheme_cluster_idx < glyph_count_after; ++next_grapheme_cluster_idx)
        {
            if (glyph_infos_after[next_grapheme_cluster_idx].cluster != glyph_info_after.cluster)
            {
                break;
            }
        }
        
        for (; source_idx < glyph_count_before; ++source_idx)
        {
            hb_glyph_info_t& glyph_info_before = glyph_infos_before[source_idx];
            
            if (next_grapheme_cluster_idx < glyph_count_after && glyph_info_before.cluster > glyph_infos_after[next_grapheme_cluster_idx].cluster)
            {
                // source_idx is now on the codepoint for the next glyph
                break;
            }
            
            printf("  %d\n", glyph_info_before.codepoint);
        }
    }
    
    free(glyph_infos_before);
    

    Looking at the comment on clusters from hb_glyph_info_t

    /**
     * hb_glyph_info_t:
     * @codepoint: either a Unicode code point (before shaping) or a glyph index
     *             (after shaping).
     * @mask: 
     * @cluster: the index of the character in the original text that corresponds
     *           to this #hb_glyph_info_t, or whatever the client passes to
     *           hb_buffer_add(). More than one #hb_glyph_info_t can have the same
     *           @cluster value, if they resulted from the same character (e.g. one
     *           to many glyph substitution), and when more than one character gets
     *           merged in the same glyph (e.g. many to one glyph substitution) the
     *           #hb_glyph_info_t will have the smallest cluster value of them.
     *           By default some characters are merged into the same cluster
     *           (e.g. combining marks have the same cluster as their bases)
     *           even if they are separate glyphs, hb_buffer_set_cluster_level()
     *           allow selecting more fine-grained cluster handling.
    

    It is also worth noting that there are various ways to set up clusters for finer-grained mapping (the default is cluster level 0); see the "Clusters" chapter of the HarfBuzz manual.