import os
import gc
import pandas as pd
staples_file = '../../../data/networking/Staples_Networking_2018_08_29.csv'
if os .path .exists (staples_file ):
print ('File Found at {}' .format (staples_file ))
else :
raise Exception ('File not found at {}' .format (staples_file ))
dataframe = pd .read_csv (staples_file , sep = '^' )
print ("total rows& columns: " , dataframe .shape )
rowCount = int (dataframe .shape [0 ])
File Found at ../../../data/networking/Staples_Networking_2018_08_29.csv
total rows& columns: (4809, 235)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3071: DtypeWarning: Columns (20,21,26,28,31,33,34,36,40,41,43,44,45,48,49,50,51,52,53,56,57,58,59,60,63,64,66,72,73,74,78,79,80,81,83,85,88,90,92,93,94,95,96,97,99,100,101,102,104,105,107,109,110,114,115,116,117,118,120,123,124,125,127,128,129,130,131,132,133,134,135,136,137,138,139,140,142,143,144,145,146,147,148,153,154,155,159,160,165,167,169,170,172,173,174,176,180,181,182,184,185,186,187,188,189,190,191,194,196,197,198,199,201,202,204,207,208,209,210,211,213,215,216,219,221,224,225,226,227,228,229,230,231,232,233,234) have mixed types.Specify dtype option on import or set low_memory=False.
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
Null Count across the column
print ('\n ' .join (['{:3d}.{:30s} - {:4d}' .format (i ,dataframe .columns [i ], val ) \
for i , val in enumerate (dataframe .isnull ().sum ().tolist ())]))
0.Competitor name - 0
1.Class number - 0
2.Class name - 0
3.SKU ID - 0
4.Url - 0
5.Brand - 0
6.SKU title - 0
7.Description - 8
8.Price - 22
9.List Price - 4737
10.Shipping Price - 4809
11.Subscription Price - 4809
12.Ratings - 0
13.Total no of Reviews - 0
14.Image url - 0
15.UPC - 0
16.Model number - 18
17.Manufacturer part number - 4809
18.Best Seller Rank - 4809
19.# of Ports - 4785
20.Adapter & Splitter Type - 4586
21.Adapter Type - 4730
22.Adapters, Cords & Power Type - 4684
23.Address Database Size - 4759
24.Amperage (amps) - 4795
25.Amps - 4809
26.Antenna - 4712
27.Aperture (f:) - 4808
28.Apple HomeKit Enabled - 4795
29.Auto-Detecting - 4746
30.Auto-Sensing - 4729
31.Average Life (hours) - 4808
32.Buffer Memory - 4773
33.Bulb Base Type - 4808
34.Bulb type - 4802
35.Cable Color - 4169
36.Cable Gauge (AWG) - 4655
37.Cable Jacket Material - 4237
38.Cable Length (Inches) - 4650
39.Cable Length (ft) - 3581
40.Cable Management Length (Feet) - 4735
41.Cable Management Type - 4663
42.Cable Type - 4452
43.Camera Angle Adjustment - 4808
44.Camera IR Coverage (Feet) - 4808
45.Camera Image Sensor - 4807
46.Certification & Standards - 4531
47.Color Family - 3569
48.Color Supported - 4808
49.Compatibility - 4643
50.Compatible Devices - 4801
51.Compatible Operating System - 4771
52.Compatible with Alexa - 4793
53.Compatible with Google Home - 4795
54.Computer Cable Type - 4177
55.Conductor Material - 4144
56.Connectivity - 4804
57.Connectivity (Wireless) - 4807
58.Connectivity Type - 4430
59.Connector End 1 - 4561
60.Connector End 2 - 4606
61.Connector Finish - 4219
62.Connector Gender - 3645
63.DLNA - 4803
64.DLNA Media Streaming - 4744
65.Data Rate Performance - 4763
66.Data Rate Speed - 4697
67.Data Transfer Rate (Gbps) - 4651
68.Data Transfer Rate (Mbps) - 4528
69.Depth in Inches - 4077
70.Diameter in Inches - 4807
71.Diameter-inches - 4789
72.Digital Zoom - 4806
73.Ease of Use - 4791
74.Encryption Type - 4766
75.End 1 Connector Type - 4257
76.End 2 Connector Type - 4129
77.Ethernet Number of Ports - 4517
78.Extender & Repeater Design - 4782
79.Extender & Repeater Type - 4758
80.External Drives Power - 4792
81.Features - 4802
82.Field Of View - 4808
83.Fixed or PTZ - 4808
84.Frame Rate (fps) - 4808
85.Frequency (Wireless) - 4586
86.Full-Duplex - 4721
87.Gaming - 4359
88.Gigabit - 4716
89.Gigabit Ethernet Uplink - 4741
90.HDMI Compatible - 4803
91.Height in Inches - 4036
92.High Definition - 4808
93.Home Automation Product Type - 4756
94.Home Automation Type - 4794
95.Hub Compatibility - 4805
96.Hub Type - 4743
97.IEEE - 4808
98.Input Voltage - 4772
99.Installation - 4771
100.Installation Software - 4722
101.Integrated Data Storage - 4761
102.Integrated VPN - 4760
103.Interface - 4364
104.Interface or Port - 4808
105.Interface or Port Type - 4748
106.Jumbo Frames - 4751
107.Kit Includes - 4778
108.LACP - 4738
109.LAN Ports - 4778
110.Language Options - 4806
111.Layer - 4708
112.Length (ft) - 4616
113.Length in Inches - 4709
114.Lens - 4808
115.Material of Item - 4765
116.Media Players Features - 4806
117.Media Players Type - 4806
118.Megapixels - 4808
119.Micro - 4460
120.Microphones Frequency - 4808
121.Mini - 4447
122.Minimum Illumination - 4808
123.Modem Form Factor - 4787
124.Modem Ports - 4786
125.Modem Standard Type - 4754
126.Module Support - 4759
127.Monitor Cable Type - 4741
128.NAS DLNA Media Streaming - 4775
129.NAS Ethernet (RJ-45) Ports - 4773
130.NAS Form Factor - 4758
131.NAS HDD Interface - 4792
132.NAS Hot Swappable - 4773
133.NAS Included HDD Capacity - 4783
134.NAS Mobile App Support - 4796
135.NAS RAID Support - 4759
136.NAS Recommended Use - 4788
137.NAS Remote Access - 4789
138.NAS Series - 4790
139.NAS USB Ports - 4780
140.NAS eSATA Ports - 4796
141.Network Management Type - 4521
142.Network Platform - 4778
143.Network Storage Capacity - 4786
144.Network Storage Type - 4778
145.Networking & WiFi Features - 4804
146.Networking Accessory Type - 4601
147.Night Vision Capable - 4807
148.Night Vision Type - 4808
149.Number Of USB Ports - 4791
150.Number of Bays - 4753
151.Number of Ports - 4738
152.Number of Security Cameras Included - 4807
153.OS Compatibility - 4803
154.On-Screen Display - 4789
155.Output Voltage - 4808
156.Pack Qty - 4687
157.Pack Size - 4802
158.Pan Angle (Degrees) - 4808
159.Parental Controls - 4712
160.Pass Through Power Plug - 4801
161.Performance Class - 4729
162.Pins - 4506
163.PoE - 4730
164.Port Mirroring - 4704
165.Power - 4807
166.Power Consumption (Watts) - 4807
167.Power Source - 4793
168.Power Supply - 4757
169.Power-Voltage - 4806
170.Powerline Adapter Ports - 4805
171.Powerline Adapter Speed (Mbps) - 4804
172.Powerline Security - 4803
173.Printer Cable Interface - 4806
174.Processor - 4801
175.Processor Speed (Ghz) - 4801
176.Product Color Family - 4402
177.Protocols - 4777
178.QoS - 4731
179.RAM Desktop Memory (GB) - 4803
180.Requirements - 4802
181.Resolution - 4794
182.Router Number of Ports - 4714
183.SNMP - 4736
184.Screen Resolution - 4804
185.Security - 4808
186.Security (Routers) - 4697
187.Security Camera Audio Output - 4808
188.Security Camera Commercial or Residential - 4808
189.Security Camera Computer Capability - 4808
190.Security Camera Connectivity - 4806
191.Security Camera Indoor or Outdoor - 4808
192.Security Camera Memory Size (MB) - 4808
193.Serial ATA (SATA) - 4526
194.Series - 4808
195.Series or Collection - 4645
196.Skype Certified - 4808
197.Specialty Paper Type - 4808
198.Streaming Media Player Connectivity Type - 4806
199.Streaming Media Player Interface - 4803
200.Super Thin - 4492
201.Supported Internet Services - 4804
202.Supported Operating System - 4808
203.Switch Form Factor - 4731
204.TV Accessory Type - 4808
205.Tilt Angle (deg) - 4808
206.True Color - 3775
207.Type of Printer Cable - 4803
208.USB Adapter & Access Point Type - 4703
209.USB Cable Type - 4600
210.USB Port - 4525
211.USB Type - 4760
212.VLAN Support - 4728
213.VPN Support - 4764
214.Video Cable Type - 4358
215.Video Compression - 4808
216.Voltage Rating - 4786
217.Warranty - 3690
218.Warranty Information - 4170
219.Webcam Design - 4808
220.Weight (Ounces) - 4494
221.Weight (lbs) - 3696
222.Width in Inches - 4018
223.Wifi (Feet) - 4801
224.Wired Security Cameras Type - 4808
225.Wireless Bands - 4632
226.Wireless Connectivity - 4673
227.Wireless Connectivity Filter - 4803
228.Wireless Data Access - 4707
229.Wireless Data Transfer Rate (Mbps) - 4764
230.Wireless Dual-Band - 4724
231.Wireless Frequency - 4753
232.Wireless Technology - 4527
233.Wireless Technology Speed - 4699
234.Zoom - 4807
Print the column name where null count > 75 % of row count
columns_2b_deleted = []
for i , count in enumerate (dataframe .isnull ().sum ().tolist ()):
if count >= rowCount * .75 :
columns_2b_deleted .append (dataframe .columns [i ])
print ('{} Columns 2 b deleted ' .format (len (columns_2b_deleted )))
print (columns_2b_deleted )
dataframe .drop (columns_2b_deleted ,axis = 1 , inplace = True )
print ("file shape after dropping : " , dataframe .shape )
print ('\n ' .join (['{:3d}.{:30s} - {:4d}' .format (i ,dataframe .columns [i ], val ) \
for i , val in enumerate (dataframe .isnull ().sum ().tolist ())]))
dataframe .drop (['Competitor name' ,'Class number' ,'Class name' ,'Url' ,'List Price' ,'Shipping Price' ,'Subscription Price' ,\
'Image url' ,'Certification & Standards' ,'Color Family' ,'Compatible Devices' ,\
'Manufacturer part number' ,'Compatible Operating System' ,'Connectivity Type' ,\
'Best Seller Rank' ,'# of Ports' ,'Adapter & Splitter Type' ,'Adapter Type' ,'Adapters, Cords & Power Type' ,'Address Database Size' ,'Amperage (amps)' ,'Amps' ,'Antenna' ,'Aperture (f:)' ,'Apple HomeKit Enabled' ,'Auto-Detecting' ,'Auto-Sensing' ,'Average Life (hours)' ,'Buffer Memory' ,'Bulb Base Type' ,'Bulb type' ,'Cable Color' ,'Cable Gauge (AWG)' ,'Cable Jacket Material' ,'Cable Length (Inches)' ,'Cable Length (ft)' ,'Cable Management Length (Feet)' ,'Cable Management Type' ,'Cable Type' ,'Camera Angle Adjustment' ,'Camera IR Coverage (Feet)' ,'Camera Image Sensor' ,'Color Supported' ,'Compatibility' ,'Compatible with Alexa' ,'Compatible with Google Home' ,'Computer Cable Type' ,'Conductor Material' ,'Connectivity' ,'Connectivity (Wireless)' ,'Connector End 1' ,'Connector End 2' ,'Connector Finish' ,'Connector Gender' ,'DLNA' ,'Connector End 1' ,'Connector End 2' ,'Connector Finish' ,'Connector Gender' ,'DLNA' ,'DLNA Media Streaming' ,'Data Rate Performance' ,'Data Rate Speed' ,'Data Transfer Rate (Gbps)' ,'Data Transfer Rate (Mbps)' ,'Depth in Inches' ,'Diameter in Inches' ,'Diameter-inches' ,'Digital Zoom' ,'Ease of Use' ,'Encryption Type' ,'End 1 Connector Type' ,'End 2 Connector Type' ,'Ethernet Number of Ports' ,'Extender & Repeater Design' ,'Extender & Repeater Type' ,'External Drives Power' ,'Features' ,'Field Of View' ,'Fixed or PTZ' ,'Frame Rate (fps)' ,'Frequency (Wireless)' ,'Full-Duplex' ,'Gaming' ,'Gigabit' ,'Gigabit Ethernet Uplink' ,'HDMI Compatible' ,'Height in Inches' ,'High Definition' ,'Home Automation Product Type' ,'Home Automation Type' ,'Hub Compatibility' ,'Hub Type' ,'IEEE' ,'Input Voltage' ,'Installation' ,'Installation Software' ,'Integrated Data Storage' ,'Integrated VPN' ,'Interface' ,'Interface or Port' ,'Interface or Port Type' ,'Jumbo Frames' ,'Kit Includes' ,'LACP' ,'LAN Ports' ,'Language Options' ,'Layer' ,'Length (ft)' ,'Length in Inches' ,'Lens' ,'Material of Item' ,'Media Players Features' ,'Media Players Type' ,'Megapixels' ,'Micro' ,'Microphones Frequency' ,'Mini' ,'Minimum Illumination' ,'Modem Form Factor' ,'Modem Ports' ,'Modem Standard Type' ,'Module Support' ,'Monitor Cable Type' ,'NAS DLNA Media Streaming' ,'NAS Ethernet (RJ-45) Ports' ,'NAS Form Factor' ,'NAS HDD Interface' ,'NAS Hot Swappable' ,'NAS Included HDD Capacity' ,'NAS Mobile App Support' ,'NAS RAID Support' ,'NAS Recommended Use' ,'NAS Remote Access' ,'NAS Series' ,'NAS USB Ports' ,'NAS eSATA Ports' ,'Network Management Type' ,'Network Platform' ,'Network Storage Capacity' ,'Network Storage Type' ,'Networking & WiFi Features' ,'Networking Accessory Type' ,'Night Vision Capable' ,'Night Vision Type' ,'Number Of USB Ports' ,'Number of Bays' ,'Number of Ports' ,'Number of Security Cameras Included' ,'OS Compatibility' ,'On-Screen Display' ,'Output Voltage' ,'Pack Qty' ,'Pack Size' ,'Pan Angle (Degrees)' ,'Parental Controls' ,'Pass Through Power Plug' ,'Performance Class' ,'Pins' ,'PoE' ,'Port Mirroring' ,'Power' ,'Power Consumption (Watts)' ,'Power Source' ,'Power Supply' ,'Power-Voltage' ,'Powerline Adapter Ports' ,'Powerline Adapter Speed (Mbps)' ,'Powerline Security' ,'Printer Cable Interface' ,'Processor' ,'Processor Speed (Ghz)' ,'Product Color Family' ,'Protocols' ,'QoS' ,'RAM Desktop Memory (GB)' ,'Requirements' ,'Resolution' ,'Router Number of Ports' ,'SNMP' ,'Screen Resolution' ,'Security' ,'Security (Routers)' ,'Security Camera Audio Output' ,'Security Camera Commercial or Residential' ,'Security Camera Computer Capability' ,'Security Camera Connectivity' ,'Security Camera Indoor or Outdoor' ,'Security Camera Memory Size (MB)' ,'Serial ATA (SATA)' ,'Series' ,'Series or Collection' ,'Skype Certified' ,'Specialty Paper Type' ,'Streaming Media Player Connectivity Type' ,'Streaming Media Player Interface' ,'Super Thin' ,'Supported Internet Services' ,'Supported Operating System' ,'Switch Form Factor' ,'TV Accessory Type' ,'Tilt Angle (deg)' ,'True Color' ,'Type of Printer Cable' ,'USB Adapter & Access Point Type' ,'USB Cable Type' ,'USB Port' ,'USB Type' ,'VLAN Support' ,'VPN Support' ,'Video Cable Type' ,'Video Compression' ,'Voltage Rating' ,'Warranty' ,'Warranty Information' ,'Webcam Design' ,'Weight (Ounces)' ,'Weight (lbs)' ,'Width in Inches' ,'Wifi (Feet)' ,'Wired Security Cameras Type' ,'Wireless Bands' ,'Wireless Connectivity' ,'Wireless Connectivity Filter' ,'Wireless Data Access' ,'Wireless Data Transfer Rate (Mbps)' ,'Wireless Dual-Band' ,'Wireless Frequency' ,'Wireless Technology' ,'Wireless Technology Speed' ,'Zoom' ], axis = 1 , inplace = True )
print ("Current Shape: " , dataframe .shape )
Current Shape: (4809, 9)
print ('\n ' .join (['{:3d}.{:30s} - {:4d}' .format (i ,dataframe .columns [i ], val ) \
for i , val in enumerate (dataframe .isnull ().sum ().tolist ())]))
0.SKU ID - 0
1.Brand - 0
2.SKU title - 0
3.Description - 8
4.Price - 22
5.Ratings - 0
6.Total no of Reviews - 0
7.UPC - 0
8.Model number - 18
Finding Duplicate Across Rows
duplicate = dataframe [dataframe .duplicated ()]
print (' duplicate Data Frame Size {} ' .format (duplicate .shape ))
duplicate Data Frame Size (9, 9)
dataframe .drop_duplicates (keep = 'first' , inplace = True )
print ('Shape of dataframe after removing duplicates {} ' .format (dataframe .shape ))
Shape of dataframe after removing duplicates (4800, 9)
dataframe .replace ({'NaN' : None , 'None' : None , 'na' : None }, inplace = True )
print ("Current Shape: " , dataframe .shape )
Current Shape: (4800, 9)
Finding Unique key / Primary key
print ("Total unique SKU ID count : {0}, Total SKU ID count : {1}" .format (dataframe ['SKU ID' ].nunique (),dataframe ['SKU ID' ].count ()))
print ("Total unique UPC count : {0}, Total UPC count : {1}" .format (dataframe ['UPC' ].nunique (),dataframe ['UPC' ].count ()))
print ("Total unique Model number count : {0}, Total model number count : {1}" .format (dataframe ['Model number' ].nunique (),dataframe ['Model number' ].count ()))
Total unique SKU ID count : 4799, Total SKU ID count : 4800
Total unique UPC count : 4562, Total UPC count : 4800
Total unique Model number count : 4778, Total model number count : 4782
Finding is there any Null in Primary key.
dataframe ['SKU ID' ].isnull ().sum ()
0
Deleting Null values in Primary key
dataframe .dropna (subset = ['SKU ID' ], inplace = True )
print ('DataFrame Current Size {} ' .format (dataframe .shape ))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-23-e527cbc62c1d> in <module>
----> 1 dataframe.dropna(subset=['SKU ID'], inplace=True)
2 print('DataFrame Current Size {} '.format(dataframe.shape))
NameError: name 'dataframe' is not defined
Finding Duplicate in Primary Key
dataframe .duplicated (subset = ['SKU ID' ]).sum ()
1
Deleting Duplicate Primary key
dataframe .drop_duplicates (subset = ['SKU ID' ],keep = 'first' , inplace = True )
print ('DataFrame Current Size {} ' .format (dataframe .shape ))
DataFrame Current Size (4799, 9)
dataframe .to_csv ('../networking_staples_cleaned_data.csv' , sep = '^' , index = False )
print ("Current Shape: " , dataframe .shape )
del dataframe , duplicate
gc .collect ()
254
description
Value
row count
4799
column Count
9
Unique-key
SKU ID