{"id":6749,"date":"2023-12-18T03:49:15","date_gmt":"2023-12-18T03:49:15","guid":{"rendered":"https:\/\/www.garysieling.com\/blog\/?p=6749"},"modified":"2023-12-18T03:49:41","modified_gmt":"2023-12-18T03:49:41","slug":"scraping-pa-house-senate-committees-to-json-or-csv","status":"publish","type":"post","link":"https:\/\/www.garysieling.com\/blog\/scraping-pa-house-senate-committees-to-json-or-csv\/","title":{"rendered":"Scraping PA House\/Senate Committees to JSON or CSV"},"content":{"rendered":"\n<p>The PA House\/Senate websites contain a list of committee assignments, but there is no way to export it to an Excel-compatible format.<\/p>\n\n\n\n<p>The script below does the following:<\/p>\n\n\n\n<ul><li>Extract the assignments as listed<\/li><li>Map each rep to their committees and subcommittees<\/li><li>Link each rep to their district and party in a CSV<\/li><\/ul>\n\n\n\n<p><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>const fs = require('fs');\r\nconst house = fs.readFileSync('.\/scraping\/legislature\/House Member Committee Assignments - PA House of Representatives.html') + '';\r\n\r\nconst Papa = require('papaparse');\r\nlet cheerio = require('cheerio');\r\n\r\r\n  const nameToDistrict = {};\r\n  const allCommittees = {};\r\n  \r\n  `Name\t\tParty\tDistrict\r\n  Aerion Abney\t\tD\t19\r\n  Joseph Adams\t\tR\t139\r\n  Mike Armanini\t\tR\t75\r\n  Jacob Banta\t\tR\t4\r\n  Jamie Barton\t\tR\t124\r\n  Anthony Bellmon\t\tD\t203\r\n  Jessica Benham\t\tD\t36\r\n  Kerry Benninghoff\t\tR\t171\r\n  Aaron Bernstine\t\tR\t8\r\n  Ryan Bizzarro\t\tD\t3\r\n  Timothy Bonner\t\tR\t17\r\n  Stephanie Borowicz\t\tR\t76\r\n  Lisa Borowski\t\tD\t168\r\n  Heather Boyd\t\tD\t163\r\n  Kevin Boyle\t\tD\t172\r\n  Matthew Bradford\t\tD\t70\r\n  Tim Brennan\t\tD\t29\r\n  Tim Briggs\t\tD\t149\r\n  Amen Brown\t\tD\t10\r\n  Marla Brown\t\tR\t9\r\n  Donna Bullock\t\tD\t195\r\n  Danilo Burgos\t\tD\t197\r\n  Frank Burns\t\tD\t72\r\n  Mike Cabell\t\tR\t117\r\n  Martin Causer\t\tR\t67\r\n  Johanny Cepeda-Freytiz\t\tD\t129\r\n  Morgan 
Cephas\t\tD\t192\r\n  Melissa Cerrato\t\tD\t151\r\n  Joe Ciresi\t\tD\t146\r\n  Scott Conklin\t\tD\t77\r\n  Bud Cook\t\tR\t50\r\n  Jill Cooper\t\tR\t55\r\n  Gina Curry\t\tD\t164\r\n  Bryan Cutler\t\tR\t100\r\n  Joseph D'Orsie\t\tR\t47\r\n  Mary Jo Daley\t\tD\t148\r\n  Eric Davanzo\t\tR\t58\r\n  Tina Davis\t\tD\t141\r\n  Jason Dawkins\t\tD\t179\r\n  Daniel Deasy\t\tD\t27\r\n  David Delloso\t\tD\t162\r\n  Sheryl Delozier\t\tR\t88\r\n  Russ Diamond\t\tR\t102\r\n  Kyle Donahue\t\tD\t113\r\n  George Dunbar\t\tR\t56\r\n  Torren Ecker\t\tR\t193\r\n  Joe Emrick\t\tR\t137\r\n  Mindy Fee\t\tR\t37\r\n  Elizabeth Fiedler\t\tD\t184\r\n  Wendy Fink\t\tR\t94\r\n  Justin Fleming\t\tD\t105\r\n  Jamie Flick\t\tR\t83\r\n  Ann Flood\t\tR\t138\r\n  Dan Frankel\t\tD\t23\r\n  Robert Freeman\t\tD\t136\r\n  Paul Friel\t\tD\t26\r\n  Jonathan Fritz\t\tR\t111\r\n  Pat Gallagher\t\tD\t173\r\n  John Galloway\t\tD\t140\r\n  Valerie Gaydos\t\tR\t44\r\n  Matthew Gergely\t\tD\t35\r\n  Mark Gillen\t\tR\t128\r\n  Jose Giral\t\tD\t180\r\n  Barbara Gleim\t\tR\t199\r\n  G. Roni Green\t\tD\t190\r\n  Jim Gregory\t\tR\t80\r\n  Keith Greiner\t\tR\t43\r\n  Seth Grove\t\tR\t196\r\n  Nancy Guenst\t\tD\t152\r\n  Manuel Guzman Jr.\t\tD\t127\r\n  Jim Haddock\t\tD\t118\r\n  Joe Hamm\t\tR\t84\r\n  Liz Hanbidge\t\tD\t61\r\n  Patrick Harkins\t\tD\t1\r\n  Jordan Harris\t\tD\t186\r\n  Doyle Heffley\t\tR\t122\r\n  Carol Hill-Evans\t\tD\t95\r\n  Joe Hogan\t\tR\t142\r\n  Joseph C. Hohenstein\t\tD\t177\r\n  Kristine Howard\t\tD\t167\r\n  Rich Irvin\t\tR\t81\r\n  MaryLouise Isaacson\t\tD\t175\r\n  R. 
Lee James\t\tR\t64\r\n  Mike Jones\t\tR\t93\r\n  Tom Jones\t\tR\t98\r\n  Barry Jozwiak\t\tR\t5\r\n  Joshua Kail\t\tR\t15\r\n  Aaron Kaufer\t\tR\t120\r\n  Rob Kauffman\t\tR\t89\r\n  Carol Kazeem\t\tD\t159\r\n  Dawn Keefer\t\tR\t92\r\n  Malcolm Kenyatta\t\tD\t181\r\n  Dallas Kephart\t\tR\t73\r\n  Joe Kerwin\t\tR\t125\r\n  Tarik Khan\t\tD\t194\r\n  Patty Kim\t\tD\t103\r\n  Emily Kinkead\t\tD\t20\r\n  Stephen Kinsey\t\tD\t201\r\n  Kate Klunk\t\tR\t169\r\n  Bridget Kosierowski\t\tD\t114\r\n  Rick Krajewski\t\tD\t188\r\n  Leanne Krueger\t\tD\t161\r\n  Charity Grimm Krupa\t\tR\t51\r\n  Anita Astorino Kulik\t\tD\t45\r\n  Thomas Kutz\t\tR\t87\r\n  Andrew Kuzma\t\tR\t39\r\n  Shelby Labs\t\tR\t143\r\n  John Lawrence\t\tR\t13\r\n  Robert Leadbeter\t\tR\t109\r\n  Milou Mackenzie\t\tR\t131\r\n  Ryan Mackenzie\t\tR\t187\r\n  Maureen Madden\t\tD\t115\r\n  Dave Madsen\t\tD\t104\r\n  Abby Major\t\tR\t60\r\n  Zachary Mako\t\tR\t183\r\n  Steven Malagari\t\tD\t53\r\n  David M. Maloney Sr.\t\tR\t130\r\n  Kristin Marcell\t\tR\t178\r\n  Brandon Markosek\t\tD\t25\r\n  Jim Marshall\t\tR\t14\r\n  Robert Matzie\t\tD\t16\r\n  La'Tasha Mayes\t\tD\t24\r\n  Joe McAndrew\t\tD\t32\r\n  Joanna McClinton\t\tD\t191\r\n  Jeanne McNeill\t\tD\t133\r\n  Thomas L. 
Mehaffie III\t\tR\t106\r\n  Steven Mentzer\t\tR\t97\r\n  Robert Mercuri\t\tR\t28\r\n  Robert Merski\t\tD\t2\r\n  Carl Walker Metzgar\t\tR\t69\r\n  Natalie Mihalek\t\tR\t40\r\n  Brett Miller\t\tR\t41\r\n  Dan Miller\t\tD\t42\r\n  Dan Moul\t\tR\t91\r\n  Kyle Mullins\t\tD\t112\r\n  Brian Munroe\t\tD\t144\r\n  Marci Mustello\t\tR\t11\r\n  Ed Neilson\t\tD\t174\r\n  Eric Nelson\t\tR\t57\r\n  Napoleon Nelson\t\tD\t154\r\n  Jennifer O'Mara\t\tD\t165\r\n  Timothy O'Neal\t\tR\t48\r\n  Donna Oberlander\t\tR\t63\r\n  Jason Ortitay\t\tR\t46\r\n  Danielle Friel Otten\t\tD\t155\r\n  Clint Owlett\t\tR\t68\r\n  Darisha Parker\t\tD\t198\r\n  Eddie Day Pashinski\t\tD\t121\r\n  Tina Pickett\t\tR\t110\r\n  Chris Pielli\t\tD\t156\r\n  Nick Pisciottano\t\tD\t38\r\n  Tarah Probst\t\tD\t189\r\n  Christopher Rabb\t\tD\t200\r\n  Jack Rader Jr.\t\tR\t176\r\n  Kathy Rapp\t\tR\t65\r\n  Jim Rigby\t\tR\t71\r\n  Brad Roae\t\tR\t6\r\n  Leslie Rossi\t\tR\t59\r\n  David H. Rowe\t\tR\t85\r\n  Mark Rozzi\t\tD\t126\r\n  Alec Ryncavage\t\tR\t119\r\n  Abigail Salisbury\t\tD\t34\r\n  Steve Samuelson\t\tD\t135\r\n  Benjamin Sanchez\t\tD\t153\r\n  Christina Sappey\t\tD\t158\r\n  Paul Schemel\t\tR\t90\r\n  Donna Scheuren\t\tR\t147\r\n  John Schlegel\t\tR\t101\r\n  Michael Schlossberg\t\tD\t132\r\n  Louis C. Schmitt Jr.\t\tR\t79\r\n  Peter Schweyer\t\tD\t134\r\n  Stephenie Scialabba\t\tR\t12\r\n  Greg Scott\t\tD\t54\r\n  Melissa Shusterman\t\tD\t157\r\n  Joshua Siegel\t\tD\t22\r\n  Brian Smith\t\tR\t66\r\n  Ismail Smith-Wade-El\t\tD\t49\r\n  Jared Solomon\t\tD\t202\r\n  Craig Staats\t\tR\t145\r\n  Perry Stambaugh\t\tR\t86\r\n  Mandy Steele\t\tD\t33\r\n  Joanne Stehr\t\tR\t107\r\n  Michael Stender\t\tR\t108\r\n  James B. Struzzi II\t\tR\t62\r\n  P. 
Michael Sturla\t\tD\t96\r\n  Paul Takac\t\tD\t82\r\n  Kathleen Tomlinson\t\tR\t18\r\n  Jesse Topper\t\tR\t78\r\n  Tim Twardzik\t\tR\t123\r\n  Arvind Venkat\t\tD\t30\r\n  Greg Vitali\t\tD\t166\r\n  Ryan Warner\t\tR\t52\r\n  Perry Warren\t\tD\t31\r\n  Dane Watro\t\tR\t116\r\n  Ben Waxman\t\tD\t182\r\n  Joe Webster\t\tD\t150\r\n  Parke Wentling\t\tR\t7\r\n  Martina White\t\tR\t170\r\n  Craig Williams\t\tR\t160\r\n  Dan Williams\t\tD\t74\r\n  Regina Young\t\tD\t185\r\n  David Zimmerman\t\tR\t99\r\n  Lindsay Powell\tRepresentative Lindsay Powell - PA House of Representatives (state.pa.us)\tD\t21\r\n  `.split(\"\\n\").map(\r\n    (row) => {\r\n      const &#91;name, party, something, district] = row.trim().split(\"\\t\");\r\n\r\n      nameToDistrict&#91;name.trim()] = (district || '').trim();\r\n    }\r\n  );\r\n\r\r\n\r\n  const members = &#91;];\r\n\r\n  let $ = cheerio.load(house);\r\n  $('.MemberInfoCteeList-Member').map(\r\n    (index, element) => {\r\n      const bioElt = $(element).children('.MemberInfoCteeList-Bio');\r\n      let name = bioElt.text().trim();\r\n\r\r\n      let parts = name.split(\",\");\r\n      let lastIndex = parts.length - 1;\r\n      console.log('parts&#91;lastIndex]', parts&#91;lastIndex])\r\n      let lastComponents = parts&#91;lastIndex].trim().split(' ');\r\n\r\n      let party = lastComponents.pop();\r\n\r\n      \/\/console.log('lastPart', lastPart)\r\n      console.log('party', party)\r\n      parts&#91;lastIndex] = lastComponents.join(' ');\r\n\r\n      console.log('name', name);\r\n      console.log('parts', parts);\r\n      console.log(party);\r\n      party = party.substring(1, 2);\r\n      let first = parts&#91;0];\r\n      parts&#91;0] = parts&#91;1];\r\n      parts&#91;1] = first;\r\n\r\n      let realName = parts.join(\" \").trim();\r\n\r\n      const record = {};\r\n      record.party = party;\r\n      record.name = realName;\r\n      record.district = '';\r\n\r\n      if (nameToDistrict&#91;record.name]) {\r\n        
record.district = nameToDistrict&#91;record.name];\r\n      } else {\r\n        console.log(record.name);\r\n\r\n        let name2 = record.name.replace(\/ \\w&#91;.] \/, \" \");\r\n        if (nameToDistrict&#91;name2]) {\r\n          record.district = nameToDistrict&#91;name2];\r\n        } else {\r\n          throw record.name;\r\n        }\r\n      }\r\n\r\n\r\n      let lastEntry = '';\r\n      let lastCommitee = '';\r\n\r\n      const committeesElt = $(bioElt).next().children().children().children().toArray().map(\r\n        (elt) => {\r\n          const kids = $(elt).children().toArray();\r\n          console.log(\r\n            kids.map(k => {\r\n              let role = \"Member\";\r\n\r\n              let txt = $(k).text();\r\n\r\n              if (txt.indexOf(\", \") === 0) {\r\n                txt = txt.substring(2);\r\n\r\n                record&#91;lastEntry] = txt.trim();\r\n                allCommittees&#91;lastEntry] = '';\r\n              } else {\r\n                if (txt.indexOf(\"-\") > 0) {\r\n                  let parts = txt.split(\"-\");\r\n                  txt = parts&#91;0].trim();\r\n                  role = parts&#91;1].trim();\r\n                }\r\n\r\n                if (txt.indexOf(\"Subcommittee \") &lt; 0) {\r\n                  record&#91;txt] = role.trim();\r\n\r\n                  allCommittees&#91;txt] = '';\r\n\r\n                  lastCommitee = txt.trim();\r\n                  lastEntry = txt.trim();\r\n                } else {\r\n                  record&#91;lastCommitee + \" - \" + txt] = role.trim();\r\n                  allCommittees&#91;lastCommitee + \" - \" + txt] = '';\r\n                  lastEntry = (lastCommitee + \" - \" + txt).trim();\r\n                }\r\n              }\r\n\r\n              \/\/console.log(JSON.stringify(members, null, 2));\r\n\r\n              return txt;\r\n            })\r\n          )\r\n        }\r\n      );\r\n\r\n      members.push(record);\r\n    }\r\n  )\r\n\r\n  
members.map(\r\n    (member) => {\r\n      Object.keys(allCommittees).map(\r\n        (comm) => {\r\n          if (!member.hasOwnProperty(comm)) {\r\n            member&#91;comm] = '';\r\n          }\r\n        }\r\n      )\r\n    }\r\n  )\r\n\r\r\n\r\n  fs.writeFileSync('house.json', JSON.stringify(members, null, 2));\r\n  fs.writeFileSync('house.tsv', Papa.unparse(members));\r\n}\r\n<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>The PA House\/Senate websites contain a list of committee assignments, but there is no way to export it to an Excel-compatible format. The script below does the following: Extract the assignments as listed Map each rep to their committees and subcommittees Link each rep to their district and party in a CSV<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[1],"tags":[302,495],"aioseo_notices":[],"amp_enabled":true,"_links":{"self":[{"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/posts\/6749"}],"collection":[{"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/comments?post=6749"}],"version-history":[{"count":1,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/posts\/6749\/revisions"}],"predecessor-version":[{"id":6750,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/posts\/6749\/revisions\/6750"}],"wp:attachment":[{"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/media?parent=6749"}],"wp:term
":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/categories?post=6749"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.garysieling.com\/blog\/wp-json\/wp\/v2\/tags?post=6749"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}