77from random import Random
88
99import conllu
10+ from chardet import UniversalDetector
1011from django .db import transaction
1112from django .conf import settings
1213import pyexcel
@@ -245,7 +246,8 @@ class CoNLLParser(FileParser):
245246 """
246247 def parse (self , file ):
247248 data = []
248- file = io .TextIOWrapper (file , encoding = 'utf-8' )
249+ file = EncodedIO (file )
250+ file = io .TextIOWrapper (file , encoding = file .encoding )
249251
250252 # Add check exception
251253
@@ -300,7 +302,8 @@ class PlainTextParser(FileParser):
300302 ```
301303 """
302304 def parse (self , file ):
303- file = io .TextIOWrapper (file , encoding = 'utf-8' )
305+ file = EncodedIO (file )
306+ file = io .TextIOWrapper (file , encoding = file .encoding )
304307 while True :
305308 batch = list (itertools .islice (file , settings .IMPORT_BATCH_SIZE ))
306309 if not batch :
@@ -323,7 +326,8 @@ class CSVParser(FileParser):
323326 ```
324327 """
325328 def parse (self , file ):
326- file = io .TextIOWrapper (file , encoding = 'utf-8' )
329+ file = EncodedIO (file )
330+ file = io .TextIOWrapper (file , encoding = file .encoding )
327331 reader = csv .reader (file )
328332 yield from ExcelParser .parse_excel_csv_reader (reader )
329333
@@ -364,7 +368,8 @@ def parse_excel_csv_reader(reader):
364368class JSONParser (FileParser ):
365369
366370 def parse (self , file ):
367- file = io .TextIOWrapper (file , encoding = 'utf-8' )
371+ file = EncodedIO (file )
372+ file = io .TextIOWrapper (file , encoding = file .encoding )
368373 data = []
369374 for i , line in enumerate (file , start = 1 ):
370375 if len (data ) >= settings .IMPORT_BATCH_SIZE :
@@ -506,3 +511,34 @@ def readinto(self, b):
506511 return 0 # indicate EOF
507512
508513 return io .BufferedReader (IterStream (), buffer_size = buffer_size )
514+
515+
class EncodedIO(io.RawIOBase):
    """Raw byte stream that sniffs the character encoding of ``fobj`` up front.

    On construction, ``fobj`` is read in ``buffer_size`` chunks and each chunk
    is fed to a chardet ``UniversalDetector`` until the detector reports
    ``done`` or a short/empty read signals the end of the input.  The bytes
    consumed while sniffing are kept and replayed by :meth:`readinto` before
    any further data is pulled from ``fobj``, so no input is lost.

    Typical use is to wrap the instance in ``io.TextIOWrapper`` with
    ``encoding=instance.encoding``.

    Args:
        fobj: Binary file-like object exposing ``read(size)`` and
            ``readable()``.
        buffer_size: Number of bytes requested per sniffing read.
        default_encoding: Encoding reported when the detector did not reach a
            conclusion while being fed.

    Attributes:
        encoding: Name of the detected encoding, or ``default_encoding``.
    """

    def __init__(self, fobj, buffer_size=io.DEFAULT_BUFFER_SIZE, default_encoding='utf-8'):
        super().__init__()
        chunks = []
        detector = UniversalDetector()

        while True:
            chunk = fobj.read(buffer_size)
            if not chunk:
                # Empty read (EOF) or None (no data available): stop sniffing.
                break
            detector.feed(chunk)
            chunks.append(chunk)
            if detector.done or len(chunk) < buffer_size:
                break

        # NOTE(review): detector.close() is deliberately not called, so only a
        # conclusion reached while feeding is trusted; otherwise the default
        # is used.  Confirm against the chardet docs before changing this.
        detected = detector.result['encoding'] if detector.done else None
        self.encoding = detected or default_encoding

        self._fobj = fobj
        # Joined once (linear) and exposed as a memoryview so that draining it
        # in readinto() slices without copying the remainder on every call.
        self._buffer = memoryview(b''.join(chunks))

    def readable(self):
        """Return whether the wrapped file object is readable."""
        return self._fobj.readable()

    def readinto(self, b):
        """Fill ``b`` with the next bytes of the stream.

        Bytes buffered during encoding detection are served first; once they
        are exhausted, data is read directly from the wrapped file object.

        Returns:
            The number of bytes written into ``b`` (0 at EOF), or ``None`` if
            the wrapped object returned ``None`` (no data available).
        """
        size = len(b)
        chunk = self._buffer
        if len(chunk) == 0:
            chunk = self._fobj.read(size)
            if chunk is None:
                return None

        output, rest = chunk[:size], chunk[size:]
        # Drop the reference to the sniffed prefix as soon as it is drained so
        # the (possibly large) bytes object can be garbage collected.
        self._buffer = rest if len(rest) else b''
        count = len(output)
        b[:count] = output
        return count
0 commit comments