76 lines
2.9 KiB
Python
76 lines
2.9 KiB
Python
import app.mathsHelper
|
|
import string
|
|
class dictionaryChecker:
    """Confirm whether a text is written in a language via dictionary lookup.

    The text is reduced to its set of unique, lower-cased words and the share
    of those words that appear in the language's word list is computed as a
    percentage.

    Call ``confirmlanguage(text, language)``:

    * text: the text you want to confirm
    * language: the language you want to confirm; its word list is read from
      ``app/languageCheckerMod/<language>.txt`` (``English.txt`` is used when
      no such file exists)

    Deciding which language to try is outside the scope of this class (the
    original notes point to chisquared.py for that).

    ``languageThreshold`` is 55: a text is confirmed when more than 55% of its
    unique words are found in the word list.
    """

    # Directory (relative to the working directory) holding "<language>.txt".
    _DICTIONARY_DIR = "app/languageCheckerMod/"
    # Word list used when the requested language has no file of its own; this
    # is the file every call used before the language argument was honoured.
    _FALLBACK_LANGUAGE = "English"

    def __init__(self):
        # Project helper providing stripPuncuation() and percentage().
        self.mh = app.mathsHelper.mathsHelper()
        # Results of the most recent checkDictionary() call.
        self.languagePercentage = 0.0
        self.languageWordsCounter = 0.0
        # Percentage that must be exceeded for confirmlanguage() to succeed.
        self.languageThreshold = 55

    def _loadDictionary(self, language):
        """Return the word list for ``language`` as a set of lower-cased words.

        Falls back to the English word list when ``<language>.txt`` does not
        exist. Raises ``FileNotFoundError`` if the fallback is missing too.
        """
        path = self._DICTIONARY_DIR + str(language) + ".txt"
        try:
            handle = open(path, "r")
        except FileNotFoundError:
            fallback = self._DICTIONARY_DIR + self._FALLBACK_LANGUAGE + ".txt"
            handle = open(fallback, "r")
        # The file stores one word per line; drop the newline and any blank
        # lines, and fold case so lookups match the lower-cased text.
        with handle:
            cleaned = (line.strip().lower() for line in handle)
            return {word for word in cleaned if word}

    def checkDictionary(self, text, language):
        """Count how many unique words of ``text`` are in the word list.

        The text is lower-cased, passed through ``mh.stripPuncuation`` and
        split on whitespace; duplicate words are counted once.

        Side effects: stores the count in ``languageWordsCounter`` and the
        matching percentage (as computed by ``mh.percentage``) in
        ``languagePercentage``.

        * text: the text to check
        * language: name of the word list to check against

        Returns the number of unique words found in the word list.
        """
        text = self.mh.stripPuncuation(text.lower())
        # split() without arguments ignores runs of whitespace, so no empty
        # "words" end up in the set and skew the percentage.
        words = set(text.split())

        if not words:
            # Nothing to look up: report 0% instead of dividing by zero.
            self.languageWordsCounter = 0
            self.languagePercentage = 0.0
            return 0

        dictionary = self._loadDictionary(language)
        # Set intersection counts each text word at most once, so the result
        # can never exceed the number of unique words in the text.
        counter = len(words & dictionary)

        self.languageWordsCounter = counter
        self.languagePercentage = self.mh.percentage(
            float(counter), float(len(words))
        )
        return counter

    def confirmlanguage(self, text, language):
        """Return True when ``text`` exceeds the threshold for ``language``.

        Runs ``checkDictionary`` (updating ``languagePercentage`` and
        ``languageWordsCounter``) and compares the resulting percentage with
        ``languageThreshold``; the comparison is strict.
        """
        self.checkDictionary(text, language)
        return self.languagePercentage > self.languageThreshold
|