From cc6870b225084e1738f75d14cc182a951a82f636 Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Mon, 28 Jun 2021 16:48:46 -0400 Subject: [PATCH 1/9] Enhance csv sniffer has_headers() to be more accurate --- Lib/csv.py | 2 +- Lib/test/test_csv.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index dc85077f3ec6636..f2ee0e169317119 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -410,7 +410,7 @@ def has_header(self, sample): for col in list(columnTypes.keys()): - for thisType in [int, float, complex]: + for thisType in [complex, float, int]: try: thisType(row[col]) break diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 225f5c7289081a0..3a5ed27de28b1a2 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1231,7 +1231,6 @@ def test_ordered_dict_reader(self): OrderedDict([('fname', 'John'), ('lname', 'Cleese')]), ]) - class MiscTestCase(unittest.TestCase): def test__all__(self): extra = {'__doc__', '__version__'} @@ -1241,5 +1240,19 @@ def test_subclassable(self): # issue 44089 class Foo(csv.Error): ... +class TestSniffer(unittest.TestCase): + mixed = '''"time","forces" +0,0 +0.5,0.9 +''' + mixed2 = '''"time","forces" +0.5,0.9 +0,0 +''' + def test_issue43625(self): + sniffer = csv.Sniffer() + self.assertTrue(sniffer.has_header(self.mixed)) + self.assertTrue(sniffer.has_header(self.mixed2)) + if __name__ == '__main__': unittest.main() From 780f265099358a0e36e27c708623d44b8bc0b405 Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Mon, 28 Jun 2021 16:51:01 -0400 Subject: [PATCH 2/9] restore blank --- Lib/test/test_csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 3a5ed27de28b1a2..521b8589ac30c71 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1231,6 +1231,7 @@ def test_ordered_dict_reader(self): OrderedDict([('fname', 'John'), ('lname', 'Cleese')]), ]) + class MiscTestCase(unittest.TestCase): def test__all__(self): extra = {'__doc__', '__version__'} From 0379ceef4d0e7ece16041ed5c98ea35f7527a0d4 Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 09:16:23 -0400 Subject: [PATCH 3/9] Update has_header logic and test; add has_header string test --- Lib/csv.py | 12 ++++------- Lib/test/test_csv.py | 47 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index f2ee0e169317119..088a1ff8a0c556e 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -409,14 +409,10 @@ def has_header(self, sample): continue # skip rows that have irregular number of columns for col in list(columnTypes.keys()): - - for thisType in [complex, float, int]: - try: - thisType(row[col]) - break - except (ValueError, OverflowError): - pass - else: + thisType = complex + try: + thisType(row[col]) + except: # fallback to length of string thisType = len(row[col]) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 521b8589ac30c71..2771623e1ad448c 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1242,18 +1242,47 @@ def test_subclassable(self): class Foo(csv.Error): ... class TestSniffer(unittest.TestCase): - mixed = '''"time","forces" -0,0 -0.5,0.9 -''' - mixed2 = '''"time","forces" -0.5,0.9 -0,0 -''' + mixed = dedent(""""time","forces" + 1,1.5 + 0.5,5+0j + 0,0 + 1+1j,6 + """) + + mixed2 = dedent(""""time","forces" + 0,0 + 1,2 + a,b + """) + + mixed3 = dedent(""""time","forces" + 0,0 + 1,2 + a,b + """) + + sample10 = dedent(""" + abc,def + ghijkl,mno + ghi,jkl + """) + + sample11 = dedent(""" + abc,def + ghijkl,mnop + ghi,jkl + """) + def test_issue43625(self): sniffer = csv.Sniffer() self.assertTrue(sniffer.has_header(self.mixed)) - self.assertTrue(sniffer.has_header(self.mixed2)) + self.assertFalse(sniffer.has_header(self.mixed2)) + + def test_has_header_strings(self): + # More to document existing (unexpected?) behavior than anything else. + sniffer = csv.Sniffer() + self.assertFalse(sniffer.has_header(self.sample10)) + self.assertFalse(sniffer.has_header(self.sample11)) if __name__ == '__main__': unittest.main() From af50f1ee9252e0655180c0cc3d5b95ac2b8f952b Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 09:43:49 -0400 Subject: [PATCH 4/9] Update has_header docs --- Doc/library/csv.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 7a72c26d5badeb2..a27f173838cb453 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -268,7 +268,20 @@ The :mod:`csv` module defines the following classes: .. method:: has_header(sample) Analyze the sample text (presumed to be in CSV format) and return - :const:`True` if the first row appears to be a series of column headers. + :const:`True` if the first row appears to be a series of column + headers. One of two key criteria will be considered to guess if the sample contains a + header: + + - the second through n-th rows contain numeric values + - the second through n-th rows contain strings where at least one value's length + differs from that of the putative header of that column. + + Twenty rows after the first are sampled, if more than half of columns + rows meet the + criteria, :const:`True` is returned. + + .. note:: + + This method is a rough heuristic and may produce both false positives and negatives. An example for :class:`Sniffer` use:: From bef3f0666a3d0a33ba1fecdbd574d4cf3ed70ef8 Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 09:48:47 -0400 Subject: [PATCH 5/9] Add news entry --- .../next/Library/2021-06-29-07-27-08.bpo-43625.ZlAxhp.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2021-06-29-07-27-08.bpo-43625.ZlAxhp.rst diff --git a/Misc/NEWS.d/next/Library/2021-06-29-07-27-08.bpo-43625.ZlAxhp.rst b/Misc/NEWS.d/next/Library/2021-06-29-07-27-08.bpo-43625.ZlAxhp.rst new file mode 100644 index 000000000000000..a21975b948ef9e3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-06-29-07-27-08.bpo-43625.ZlAxhp.rst @@ -0,0 +1,2 @@ +Fix a bug in the detection of CSV file headers by +:meth:`csv.Sniffer.has_header` and improve documentation of same. From 73a43b1adfcd95f79e7fde42eb21281dc6c38c32 Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 09:53:47 -0400 Subject: [PATCH 6/9] move new tests to proper sniffer test class --- Lib/test/test_csv.py | 79 ++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 43 deletions(-) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 2771623e1ad448c..53893b7a27c7698 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1014,6 +1014,42 @@ class TestSniffer(unittest.TestCase): 'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back' """ + sample10 = dedent(""" + abc,def + ghijkl,mno + ghi,jkl + """) + + sample11 = dedent(""" + abc,def + ghijkl,mnop + ghi,jkl + """) + + sample12 = dedent(""""time","forces" + 1,1.5 + 0.5,5+0j + 0,0 + 1+1j,6 + """) + + sample13 = dedent(""""time","forces" + 0,0 + 1,2 + a,b + """) + + def test_issue43625(self): + sniffer = csv.Sniffer() + self.assertTrue(sniffer.has_header(self.sample12)) + self.assertFalse(sniffer.has_header(self.sample13)) + + def test_has_header_strings(self): + # More to document existing (unexpected?) behavior than anything else. + sniffer = csv.Sniffer() + self.assertFalse(sniffer.has_header(self.sample10)) + self.assertFalse(sniffer.has_header(self.sample11)) + def test_has_header(self): sniffer = csv.Sniffer() self.assertIs(sniffer.has_header(self.sample1), False) @@ -1241,48 +1277,5 @@ def test_subclassable(self): # issue 44089 class Foo(csv.Error): ... -class TestSniffer(unittest.TestCase): - mixed = dedent(""""time","forces" - 1,1.5 - 0.5,5+0j - 0,0 - 1+1j,6 - """) - - mixed2 = dedent(""""time","forces" - 0,0 - 1,2 - a,b - """) - - mixed3 = dedent(""""time","forces" - 0,0 - 1,2 - a,b - """) - - sample10 = dedent(""" - abc,def - ghijkl,mno - ghi,jkl - """) - - sample11 = dedent(""" - abc,def - ghijkl,mnop - ghi,jkl - """) - - def test_issue43625(self): - sniffer = csv.Sniffer() - self.assertTrue(sniffer.has_header(self.mixed)) - self.assertFalse(sniffer.has_header(self.mixed2)) - - def test_has_header_strings(self): - # More to document existing (unexpected?) behavior than anything else. - sniffer = csv.Sniffer() - self.assertFalse(sniffer.has_header(self.sample10)) - self.assertFalse(sniffer.has_header(self.sample11)) - if __name__ == '__main__': unittest.main() From c7151546ea83370411a2827db99188255d5298ee Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 10:13:10 -0400 Subject: [PATCH 7/9] Improve the doc section --- Doc/library/csv.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index a27f173838cb453..cb03f8da20235f9 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -268,20 +268,21 @@ The :mod:`csv` module defines the following classes: .. method:: has_header(sample) Analyze the sample text (presumed to be in CSV format) and return - :const:`True` if the first row appears to be a series of column - headers. One of two key criteria will be considered to guess if the sample contains a - header: + :const:`True` if the first row appears to be a series of column headers. + Inspecting each column, one of two key criteria will be considered to + estimate if the sample contains a header: - the second through n-th rows contain numeric values - - the second through n-th rows contain strings where at least one value's length - differs from that of the putative header of that column. + - the second through n-th rows contain strings where at least one value's + length differs from that of the putative header of that column. - Twenty rows after the first are sampled, if more than half of columns + rows meet the - criteria, :const:`True` is returned. + Twenty rows after the first row are sampled; if more than half of columns + + rows meet the criteria, :const:`True` is returned. .. note:: - This method is a rough heuristic and may produce both false positives and negatives. + This method is a rough heuristic and may produce both false positives and + negatives. An example for :class:`Sniffer` use:: From 3d1ac3e58ef8c43ee915b47f8e6347853de8f6ba Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 10:25:06 -0400 Subject: [PATCH 8/9] change back to test docstring --- Lib/test/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 53893b7a27c7698..7cb56d3a08236cc 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1045,7 +1045,7 @@ def test_issue43625(self): self.assertFalse(sniffer.has_header(self.sample13)) def test_has_header_strings(self): - # More to document existing (unexpected?) behavior than anything else. + "More to document existing (unexpected?) behavior than anything else." sniffer = csv.Sniffer() self.assertFalse(sniffer.has_header(self.sample10)) self.assertFalse(sniffer.has_header(self.sample11)) From 84dcaa703b359c6283a3b317c85668f49594387a Mon Sep 17 00:00:00 2001 From: andrei kulakov Date: Tue, 29 Jun 2021 12:07:21 -0400 Subject: [PATCH 9/9] go back to catching specific numeric exceptions --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 088a1ff8a0c556e..bb3ee269ae7931f 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -412,7 +412,7 @@ def has_header(self, sample): thisType = complex try: thisType(row[col]) - except: + except (ValueError, OverflowError): # fallback to length of string thisType = len(row[col])