I have a text file of some 10,000 lines; a few of the lines are given below.
confusables.txt
1F110 ; 0028 0041 0029 ; MA #* ( 🄐 → (A) ) PARENTHESIZED LATIN CAPITAL LETTER A → LEFT PARENTHESIS, LATIN CAPITAL LETTER A, RIGHT PARENTHESIS #
FF21 ; 0041 ; MA # ( A → A ) FULLWIDTH LATIN CAPITAL LETTER A → LATIN CAPITAL LETTER A # →А→
FF22 ; 0042 ; MA # ( B → B ) FULLWIDTH LATIN CAPITAL LETTER B → LATIN CAPITAL LETTER B # →Β→
212C ; 0042 ; MA # ( ℬ → B ) SCRIPT CAPITAL B → LATIN CAPITAL LETTER B #
1F110 ; 0028 0041 0029 ; MA #* ( 🄐 → (A) ) PARENTHESIZED LATIN CAPITAL LETTER A → LEFT PARENTHESIS, LATIN CAPITAL LETTER A, RIGHT PARENTHESIS #
1D435 ; 0042 ; MA # ( 𝐵 → B ) MATHEMATICAL ITALIC CAPITAL B → LATIN CAPITAL LETTER B #
213B ; 0046 0041 0058 ; MA #* ( ℻ → FAX ) FACSIMILE SIGN → LATIN CAPITAL LETTER F, LATIN CAPITAL LETTER A, LATIN CAPITAL LETTER X #
I want to get the first character after the opening parenthesis (the original Unicode character) in every line that matches a search string (e.g. ℬ for 'LATIN CAPITAL LETTER B' in line 4 above). I can do this using the following code:
<?php
/**
 * Prints an HTML table with a single row that lists every character from
 * confusables.txt whose line mentions the hard-coded $search_string.
 *
 * Every data line is expected to contain a "( X → Y )" group; X, the text
 * between the first "(" and the first "→", is the character that is collected.
 * Nothing is printed when no line matches.
 */

/**
 * Lazily yields the lines of confusables.txt, one line per iteration.
 * Yields nothing when the file cannot be opened.
 *
 * @return Generator
 */
$fileData = function () {
    // __DIR__ has no trailing slash, so the separator has to come with the file
    // name; './confusables.txt' would produce "<dir>./confusables.txt".
    $file = fopen(__DIR__ . '/confusables.txt', 'r');
    if (!$file) {
        return;
    }
    try {
        while (($line = fgets($file)) !== false) {
            yield $line;
        }
    } finally {
        // Runs even when the consumer stops iterating early.
        fclose($file);
    }
};

// Character name to look for in each line.
$search_string = 'LATIN CAPITAL LETTER A';

// preg_quote() keeps any regex metacharacters in the search text literal. The
// (?!\w) look-ahead stops "LETTER A" from also matching longer names such as
// "LETTER AE".
$pattern = '/' . preg_quote($search_string, '/') . '(?!\w)/i';

// One pass over the file: collect the source character of every matching line.
$unicode_chars = [];
foreach ($fileData() as $line) {
    if (preg_match($pattern, $line) !== 1) {
        continue;
    }
    // Capture the text between the first "(" and the first "→", without the
    // surrounding spaces.
    if (preg_match('/\(\s*(.*?)\s*→/u', $line, $group) === 1 && $group[1] !== '') {
        $unicode_chars[] = $group[1];
    }
}

// "LATIN CAPITAL LETTER A" -> "A": whatever follows the word LETTER.
$letter_pos = stripos($search_string, 'LETTER');
$real_char = $letter_pos === false
    ? ''
    : trim(substr($search_string, $letter_pos + strlen('LETTER')));

if ($unicode_chars !== []) {
    // The cell contents come from a data file, so escape them for HTML.
    $escape = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $html = implode("\n", [
        '<table>',
        '<tr>',
        '<th style="border:1px solid #000">ORIGINAL LETTERS</th>',
        '<th style="border:1px solid #000">UNICODE CHARACTER</th>',
        '<th style="border:1px solid #000">Description</th>',
        '</tr>',
        '<tr>',
        '<td style=" border:1px solid black;"><pre>' . $escape($real_char) . '</pre></td>',
        '<td style=" border:1px solid black;"><pre>' . $escape(implode(', ', $unicode_chars)) . '</pre></td>',
        '<td style=" border:1px solid black;"><pre>' . $escape($search_string) . '</pre></td>',
        '</tr>',
        '</table>',
    ]);
    echo $html;
}
and I get output such as
| ORIGINAL LETTER | UNICODE CHARACTER | Description |
| -------------------- | ------------------------- | -------------------------------- |
| B | B, ℬ , 𝐵 | LATIN CAPITAL LETTER B |
The output looks fine for a single (hard-coded) search string, but I have to automate the process for the entire 10,000 lines. So far I have tried:
<?php
/**
* Attempt to build one table row per search string ("LATIN CAPITAL LETTER A",
* "LATIN CAPITAL LETTER B"). As written it emits a single row only, because the
* counters and the character accumulator are shared by all search strings (see
* the notes further down).
*
* @return Generator
*/
// Generator closure: yields the lines of confusables.txt one at a time and
// yields nothing when the file cannot be opened.
// NOTE(review): __DIR__ has no trailing slash, so this path is
// "<dir>./confusables.txt" - confirm that this resolves on the target platform.
$fileData = function () {
$file = fopen(__DIR__ . './confusables.txt', 'r');
if (!$file) {
return;
}
while (($line = fgets($file)) !== false) {
yield $line;
}
fclose($file);
};
$searchStringArray = array();
// Build the search strings; range('A', 'B') gives just the letters A and B.
foreach (range('A', 'B') as $alphabet) {
$alphabets = 'LATIN CAPITAL LETTER ' . $alphabet . "";
array_push($searchStringArray, $alphabets);
}
// Accumulators. They are initialised once here and never reset per search
// string. Only 'uni-code' and 'original' are written below; 'des' is unused.
$output_string = [
'uni-code' => '',
'original' => '',
'des' => '',
];
$initial_line_count = 1; // starts at 1 and is incremented once per matching line
$final_count = 0; // ends up as 1 + the number of matches over ALL search strings
// First pass: count the matching lines. Neither counter is reset between
// search strings, so $final_count is a grand total rather than a per-letter one.
for ($i = 0; $i < count($searchStringArray); $i++) {
$search_string = $searchStringArray[$i];
// scan the whole file for the current search string
foreach ($fileData() as $line) {
// The search string is interpolated into the pattern as-is: it matches
// anywhere in the line, case-insensitively.
if (preg_match_all("/{$search_string}/i", $line)) {
$initial_line_count++;
$final_count = $initial_line_count;
// echo $final_count.'<br>';
}
}
}
$line_count = 1; // incremented per matching line in the second pass; also shared by all search strings
$html = '<table>
<tr>
<th style="border:1px solid #000">ORIGINAL LETTERS</th>
<th style="border:1px solid #000">UNICODE CHARACTER</th>
<th style="border:1px solid #000">Description</th>
</tr>';
// Second pass: extract the characters and emit the row.
for ($i = 0; $i < count($searchStringArray); $i++) {
$search_string = $searchStringArray[$i];
// scan the whole file again for the current search string
foreach ($fileData() as $line) {
// same match test as in the counting pass
if (preg_match_all("/{$search_string}/i", $line)) {
// start slicing
$slice_after = substr($line, 0, strpos($line, ' ) ')); // keep the part of the line before the first ' ) '
$slice_before = ltrim(stristr($slice_after, '('), '('); // keep the part from the first '(' on, leading '(' stripped
$first_char = substr($slice_before, 0, strpos($slice_before, "→")); // text before the first "→" (surrounding spaces included)
$split_Real_char = ltrim(stristr($search_string, 'LETTER'), 'LETTER'); // text after "LETTER"; ltrim() strips the characters L, E, T, R, not the word
$real_Char = $output_string['original'] .= $split_Real_char; // accumulated letters; $real_Char is not read afterwards
$split_Unicode_char = $output_string['uni-code'] .= $first_char . ','; // running list of EVERY character found so far, across all search strings
$line_count++; // loop termination counter
// Because $line_count and $final_count are totals over all search strings,
// this is true only once, on the last matching line overall. That is why a
// single row is produced: it carries the current letter and search string but
// the character list accumulated for all letters.
if ($line_count == $final_count) {
$html .= ' <tr>
<td style=" border:1px solid black;"><pre>' . $split_Real_char . '</pre></td>
<td style=" border:1px solid black;"><pre>' . $split_Unicode_char . '</pre></td>
<td style=" border:1px solid black;"><pre>' . $search_string . '</pre></td>
</tr>';
$html .= '</table>';
echo $html;
// leaves the inner foreach only; the outer for loop continues
break;
}
}
}
}
and I get output
| ORIGINAL LETTER | UNICODE CHARACTER | Description |
| -------------------- | ------------------------- | --------------------------- |
| B | A, 🄐, B, ℬ, 𝐵, ℻ | LATIN CAPITAL LETTER B |
I get all the Unicode characters, but issues arise with the original letters and the search strings: the Unicode characters should not all fall into a single table cell, and despite the loop running multiple times I get only a single row.
Expected output
| ORIGINAL LETTER | UNICODE CHARACTER | Description |
| -------------------- | ------------------| ---------------------- |
| A | A, 🄐, ℻ | LATIN CAPITAL LETTER A |
| B | B, ℬ, 𝐵 | LATIN CAPITAL LETTER B |
Any suggestions on how I can achieve this?
To answer my own question: I was able to get this working using a function and array_map().
<?php
// Opening markup of the result table: header row plus the opening <tbody>.
// Rows are appended to this string later, before the table is closed and printed.
$html_table = implode("\n", [
    '<table>',
    '<thead>',
    '<tr>',
    '<th style="border:1px solid #000">Original Letters</th>',
    '<th style="border:1px solid #000">Unicode Characters</th>',
    '<th style="border:1px solid #000">Description</th>',
    '</tr>',
    '</thead>',
    '<tbody>',
]);
/**
 * Appends one table row for $search_map to the global $html_table.
 *
 * Reads confusables.txt (next to this script) once and collects, from every
 * line that mentions $search_map, the text between the first "(" and the
 * first "→" - the look-alike character. The row has three cells: the text that
 * follows the word "LETTER" in $search_map, the collected characters joined by
 * ", ", and $search_map itself. No row is added when nothing matches or when
 * the file cannot be opened.
 *
 * @param string $search_map Character name to look for, e.g. "LATIN CAPITAL LETTER A".
 *                           It is matched literally and case-insensitively.
 * @return void
 */
function my_func($search_map)
{
    global $html_table;

    // __DIR__ has no trailing slash, so the separator has to come with the file
    // name. The file name matches the confusables.txt used by the earlier scripts.
    $file = fopen(__DIR__ . '/confusables.txt', 'r');
    if (!$file) {
        return;
    }

    // preg_quote() keeps regex metacharacters in the search text literal. The
    // (?!\w) look-ahead stops "LETTER A" from also matching "LETTER AE".
    $pattern = '/' . preg_quote($search_map, '/') . '(?!\w)/i';

    // A single pass is enough: collect first, build the row afterwards.
    $unicode_chars = [];
    try {
        while (($line = fgets($file)) !== false) {
            if (preg_match($pattern, $line) !== 1) {
                continue;
            }
            // Text between the first "(" and the first "→", spaces trimmed.
            if (preg_match('/\(\s*(.*?)\s*→/u', $line, $group) === 1 && $group[1] !== '') {
                $unicode_chars[] = $group[1];
            }
        }
    } finally {
        fclose($file);
    }

    if ($unicode_chars === []) {
        return;
    }

    // "LATIN CAPITAL LETTER A" -> "A". ltrim($x, 'LETTER') would strip a set of
    // characters rather than the word, so cut at the word's position instead.
    $letter_pos = stripos($search_map, 'LETTER');
    $real_char = $letter_pos === false
        ? ''
        : trim(substr($search_map, $letter_pos + strlen('LETTER')));

    // The cell contents come from a data file, so escape them for HTML.
    $escape = function ($text) {
        return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $html_table .= implode("\n", [
        '<tr>',
        '<td style=" border:1px solid black;"><pre>' . $escape($real_char) . '</pre></td>',
        '<td style=" border:1px solid black;"><pre>' . $escape(implode(', ', $unicode_chars)) . '</pre></td>',
        '<td style=" border:1px solid black;"><pre>' . $escape($search_map) . '</pre></td>',
        '</tr>',
    ]);
}
// Prefix shared by every character name that is searched for.
$search_string_part = 'LATIN CAPITAL LETTER ';
// One search string per letter: "LATIN CAPITAL LETTER A" ... "LATIN CAPITAL LETTER Z".
$search_string_array = array_map(
    function ($letter) use ($search_string_part) {
        return $search_string_part . $letter;
    },
    range('A', 'Z')
);
// Run my_func() for every search string. It is called for its effect on
// $html_table; the array that array_map() returns is not used.
array_map("my_func", $search_string_array);
// Close the table body and the table, then print the finished markup.
$html_table .= ' </tbody>' . "\n" . '</table>';
echo $html_table;