I have a huge dataset in Arabic. I cleaned the data of special characters and English characters, but I then discovered that the dataset also contains text in many other languages, such as Chinese, Japanese, and Russian. The problem is that I can't tell exactly which other languages are mixed in with the Arabic, so I need a solution that removes everything except Arabic characters from the text in a pandas DataFrame. Here is my code:
# Runs of punctuation/symbol characters that are deleted outright.
_SPECIAL_CHARS_RE = re.compile(r'[?؟!@#$%&*+~/=><]+')
# ASCII Latin letters.
_LATIN_LETTERS_RE = re.compile(r'[a-zA-Z]')
# Any run of whitespace (collapsed to a single space).
_WHITESPACE_RE = re.compile(r'\s+')
# A parenthesised, non-empty span together with one optional leading space.
_PARENTHESIZED_RE = re.compile(r' ?\([^)]+\)')
# Single-character substitutions applied in one pass: underscore and colon
# become a space; tatweel, quotes, period and comma are dropped.
_CHAR_TABLE = str.maketrans({
    '_': ' ',
    ':': ' ',
    'ـ': None,
    '"': None,
    "'": None,
    '.': None,
    ',': None,
})


def clean_txt(input_str):
    """Strip symbols, Latin letters and parenthesised text from a string.

    Steps, in order: delete special characters, delete ASCII letters, trim,
    collapse whitespace runs to one space, apply the single-character
    substitutions in ``_CHAR_TABLE``, delete ``(...)`` spans, trim again.

    Args:
        input_str: The text to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN cell) is returned unchanged.

    Returns:
        The cleaned string, or ``input_str`` itself when it is empty or not
        a string.
    """
    # Non-string values are passed through untouched instead of relying on
    # a blanket ``except`` to swallow the resulting TypeError.
    if not isinstance(input_str, str) or not input_str:
        return input_str

    # The original pattern ended with "+^", which can never match, so the
    # special characters were silently left in place.
    cleaned = _SPECIAL_CHARS_RE.sub('', input_str)
    cleaned = _LATIN_LETTERS_RE.sub('', cleaned).strip()
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)
    cleaned = cleaned.translate(_CHAR_TABLE)
    cleaned = _PARENTHESIZED_RE.sub('', cleaned)
    return cleaned.strip()
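Finally, I found the answer: replace every run of characters that is neither a digit nor in one of the Arabic Unicode ranges with a single space: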
# Matches any run of characters that is neither an ASCII digit nor in one of
# the Arabic ranges listed below. The original class listed \ufd50-\ufd8f
# twice; the second occurrence is replaced with \ufd92-\ufdc7 so that those
# Arabic Presentation Forms-A ligatures are kept as well.
NON_ARABIC_RE = re.compile(
    r'[^0-9'
    r'\u0600-\u06ff'   # Arabic
    r'\u0750-\u077f'   # Arabic Supplement
    r'\ufb50-\ufbc1'   # Arabic Presentation Forms-A (part)
    r'\ufbd3-\ufd3f'
    r'\ufd50-\ufd8f'
    r'\ufd92-\ufdc7'
    r'\ufdf0-\ufdfd'
    r'\ufe70-\ufefc'   # Arabic Presentation Forms-B
    r']+'
)

# Sample input mixing Arabic with CJK, Latin, Cyrillic, digits and symbols.
text = '大谷育江 صباح الخيرfff :"""%#$@&!~(2009 مرحباً Добро пожаловать fffff أحمــــد ݓ'
# Each non-Arabic run (including plain spaces) collapses to one space.
t = NON_ARABIC_RE.sub(' ', text)
t
' صباح الخير 2009 مرحباً أحمــــد ݓ'