{"id":2797,"date":"2020-11-16T21:35:02","date_gmt":"2020-11-16T20:35:02","guid":{"rendered":"http:\/\/www.dbonline.se\/?p=2797"},"modified":"2020-11-16T21:40:14","modified_gmt":"2020-11-16T20:40:14","slug":"pdf-avlasning-med-python","status":"publish","type":"post","link":"https:\/\/www.dbonline.se\/index.php\/2020\/11\/16\/pdf-avlasning-med-python\/","title":{"rendered":"PDF avl\u00e4sning med python"},"content":{"rendered":"\n<figure class=\"wp-block-image size-large is-resized\"><img data-recalc-dims=\"1\" loading=\"lazy\" decoding=\"async\" data-attachment-id=\"2799\" data-permalink=\"https:\/\/www.dbonline.se\/index.php\/2020\/11\/16\/pdf-avlasning-med-python\/opengraph-icon-200x200\/\" data-orig-file=\"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?fit=200%2C200&amp;ssl=1\" data-orig-size=\"200,200\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"opengraph-icon-200&#215;200\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?fit=200%2C200&amp;ssl=1\" data-large-file=\"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?fit=200%2C200&amp;ssl=1\" src=\"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?resize=89%2C89\" alt=\"\" class=\"wp-image-2799\" width=\"89\" height=\"89\" srcset=\"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?w=200&amp;ssl=1 200w, https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?resize=150%2C150&amp;ssl=1 150w\" sizes=\"auto, (max-width: 89px) 100vw, 89px\" \/><\/figure>\n\n\n\n<p>Har ni ett gammalt eller ett ekonomisystem som \u00e4r sv\u00e5rt att integrera?<br>Nedan finns kod f\u00f6r att l\u00e4sa av inneh\u00e5llet i en PDF. Detta kommer inte fungera rakt av f\u00f6r din l\u00f6sning, men kan ge inspiration.<\/p>\n\n\n\n<p>Om du beh\u00f6ver hj\u00e4lp s\u00e5 finns mina kontaktuppgifter p\u00e5 den h\u00e4r sidan eller Linkedin.<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#Created by Karl Sj\u00f6kvist\n\nfrom pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter\nfrom pdfminer.converter import TextConverter, XMLConverter, HTMLConverter\nfrom pdfminer.layout import LAParams\nfrom pdfminer.pdfpage import PDFPage\nfrom io import BytesIO\n\nimport os\nimport glob\nimport xml.etree.ElementTree as ET\n\n\ndef convert_pdf(path, format='xml', codec='utf-8', password=''):\n    rsrcmgr = PDFResourceManager()\n    retstr = BytesIO()\n    laparams = LAParams()\n    if format == 'text':\n        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)\n    elif format == 'html':\n        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)\n    elif format == 'xml':\n        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)\n    else:\n        raise ValueError('provide format, either text, html or xml!')\n    fp = open(path, 'rb')\n    interpreter = PDFPageInterpreter(rsrcmgr, device)\n    maxpages = 0\n    caching = True\n    pagenos=set()\n    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):\n        interpreter.process_page(page)\n\n    text = retstr.getvalue().decode()\n    fp.close()\n    device.close()\n    retstr.close()\n    return text\n\n#Analyserar och flyttar de analyserade filerna till klara efter att en xml har skapats av inneh\u00e5llet\n\nPATH = \"C:\/Users\/karl.sjokvist\/Desktop\/read pdf\/\"\n\nif not os.path.exists('klara'):\n    os.makedirs('klara')\n\npdffiles = (glob.glob(PATH + \"*.pdf\")) #Listar alla filer\nfor pdf in pdffiles:\n    text = convert_pdf(pdf) #Anropet till konverteringen\n    filename = pdf.split(\"\\\\\")\n    filename = filename&#91;1].split(\".\")\n    filename = filename&#91;0]\n    print (\"p\u00e5b\u00f6rjar analys av \" + filename)\n    f = open(filename + \".xml\", \"w\")\n    text = text.replace(\"utf-8\", \"ISO-8859-1\")\n    text+=\"&lt;\/pages>\" #Var tvungen att l\u00e4gga till detta d\u00e5 det finns en bugg\n    f.write(text)\n    f.close()\n    \n    os.replace(pdf, PATH+\"klara\/\" + filename + \".pdf\") #Flyttar filen\n\n    #XML query f\u00f6r att f\u00e5 fram datan\n    tree = ET.parse(filename + \".xml\")\n    root = tree.getroot()\n\n    raknadesidor = 0\n    sidor = 1\n\n    #R\u00e4knar sidor i PDFn\n    for value in root.iter('page'):\n        raknadesidor+=1\n\n    #G\u00e5r genom alla sidor enskillt\n    while sidor &lt;= raknadesidor:\n        print (\"Sida\" + str(sidor))\n        for value in root.iter('page'):\n            if value.attrib&#91;'id'] == str(sidor): #Sidorna p\u00e5 PDFen\n    \n                #F\u00f6r att f\u00e5 fram alla kostnadsrader\n                #Artikelrader\n                kostnadsrader = &#91;]\n                for subvalue in value.iter('textline'):\n                    if subvalue.attrib&#91;'bbox'].startswith(\"53.150\"): #Kordinaterna p\u00e5 PDFen\n                        textstring = \"\"\n                        for text in subvalue.iter('text'): #Sl\u00e5r ihop indeviduella tecken till rader\n                            textstring+=text.text\n                        textstring = textstring&#91;:-1]#Tar bort nya radbytet\n                        kostnadsrader.append(textstring)\n                firstindex = kostnadsrader.index(\"Artikel\") #F\u00f6rsta radindex\n                secondindex = &#91;i for i in kostnadsrader if i.startswith('Moms')]\n                secondindex = kostnadsrader.index(secondindex&#91;0])\n                kostnadsrader = kostnadsrader&#91;firstindex + 1:secondindex] #tar bort allt innan och efter index\n\n                print (kostnadsrader)\n\n                #Counts the numbers of type rows i.e. rows that has numbers\n                posoffirstnum = None\n                countrows = 0\n                for i in kostnadsrader:\n                    if i&#91;0].isnumeric():\n                        if i&#91;-1].isnumeric():\n                            if posoffirstnum == None:\n                                posoffirstnum = countrows\n                    countrows+=1\n\n                #Ben\u00e4mning\n                benamningsrader = &#91;]\n                for subvalue in value.iter('textline'):\n                    if subvalue.attrib&#91;'bbox'].startswith(\"130.400\"): #Kordinaterna p\u00e5 PDFen\n                        textstring = \"\"\n                        for text in subvalue.iter('text'): #Sl\u00e5r ihop indeviduella tecken till rader\n                            textstring+=text.text\n                        textstring = textstring&#91;:-1]#Tar bort nya radbytet\n                        benamningsrader.append(textstring)\n                firstindex = benamningsrader.index(\"Ben\u00e4mning\") #F\u00f6rsta radindex\n                benamningsrader = benamningsrader&#91;firstindex + 1:] #tar bort allt ib\u00f6rjan och slutet av array\n\n                print (benamningsrader)\n            \n                #antal\n                antalrader = &#91;]\n                for subvalue in value.iter('textline'):\n                    if subvalue.attrib&#91;'bbox'].startswith(\"360.200\") or subvalue.attrib&#91;'bbox'].startswith(\"360.650\") or subvalue.attrib&#91;'bbox'].startswith(\"365.150\") or subvalue.attrib&#91;'bbox'].startswith(\"364.700\"): #Kordinaterna p\u00e5 PDFen\n                        textstring = \"\"\n                        for text in subvalue.iter('text'): #Sl\u00e5r ihop indeviduella tecken till rader\n                            textstring+=text.text\n                        textstring = textstring&#91;:-1]#Tar bort nya radbytet\n                        textstring = textstring.replace(\" St\",\"\");\n                        textstring = textstring.replace(\" \",\"\");\n                        antalrader.append(textstring)\n\n                print (antalrader)\n            \n                #enhet\n                enheter = &#91;]\n                for subvalue in value.iter('textline'):\n                    if subvalue.attrib&#91;'bbox'].startswith(\"365.150\") or subvalue.attrib&#91;'bbox'].startswith(\"391.400\") or subvalue.attrib&#91;'bbox'].startswith(\"360.650\") or subvalue.attrib&#91;'bbox'].startswith(\"360.200\"): #Kordinaterna p\u00e5 PDFen\n                        textstring = \"\"\n                        for text in subvalue.iter('text'): #Sl\u00e5r ihop indeviduella tecken till rader\n                            textstring+=text.text\n                        textstring = textstring&#91;:-1]#Tar bort nya radbytet\n                        enheter.append(textstring)\n                \n                #Rensar on\u00f6dig informaton fr\u00e5n enhet\n                enheterloop = 0\n                while enheterloop &lt; len(enheter):\n                    if \"st\" in enheter&#91;enheterloop]:\n                        enheter&#91;enheterloop] = \"st\"\n                    if \"tim\" in enheter&#91;enheterloop]:\n                        enheter&#91;enheterloop] = \"tim\"\n                    if \"skif\" in enheter&#91;enheterloop]:\n                        enheter&#91;enheterloop] = \"skif\"\n                    if \"St\" in enheter&#91;enheterloop]:\n                        enheter&#91;enheterloop] = \"St\"\n                    if enheter&#91;enheterloop] != \"st\" and enheter&#91;enheterloop] != \"tim\" and enheter&#91;enheterloop] != \"skif\" and enheter&#91;enheterloop] != \"St\":\n                        del enheter&#91;enheterloop]\n                        if len(enheter) >=1:\n                            enheterloop-=1\n                    enheterloop+=1\n\n                print (enheter)\n            \n                #pris\n                prisrader = &#91;]\n                for subvalue in value.iter('textline'):\n                    if subvalue.attrib&#91;'bbox'].startswith(\"417.150\") or subvalue.attrib&#91;'bbox'].startswith(\"421.650\") or subvalue.attrib&#91;'bbox'].startswith(\"426.150\"): #Kordinaterna p\u00e5 PDFen\n                        textstring = \"\"\n                        for text in subvalue.iter('text'): #Sl\u00e5r ihop indeviduella tecken till rader\n                            textstring+=text.text\n                        textstring = textstring&#91;:-1]#Tar bort nya radbytet\n                        prisrader.append(textstring)\n                print (prisrader)\n\n                #skriva ut raderna\n                rader = 0\n                offset = 0\n                print (\"Skriver ut raderna\")\n                while rader &lt; len(kostnadsrader):\n                    rad = kostnadsrader&#91;rader]\n                    if rader >= posoffirstnum:\n                        if offset &lt; len(benamningsrader):\n                            rad = rad + \" \" + benamningsrader&#91;offset] + \" \" + antalrader&#91;offset] + \" \" + enheter&#91;offset] + \" \" + prisrader&#91;offset]\n                            offset+=1\n                    rader+=1\n                    print (rad)\n                print (\"klar med analys\\n\")\n        sidor+=1\n        \n<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>Har ni ett gammalt eller ett ekonomisystem som \u00e4r sv\u00e5rt att integrera?Nedan finns kod f\u00f6r att l\u00e4sa av inneh\u00e5llet i en PDF. Detta kommer inte fungera rakt av f\u00f6r din l\u00f6sning, men kan ge inspiration. Om du beh\u00f6ver hj\u00e4lp s\u00e5<\/p>\n","protected":false},"author":1,"featured_media":2799,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"jetpack_post_was_ever_published":false,"_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":true,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"enabled":false},"version":2}},"categories":[1],"tags":[],"class_list":["post-2797","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-uncategorized"],"jetpack_publicize_connections":[],"jetpack_featured_media_url":"https:\/\/i0.wp.com\/www.dbonline.se\/wp-content\/uploads\/2020\/11\/opengraph-icon-200x200-1.png?fit=200%2C200&ssl=1","jetpack_sharing_enabled":true,"jetpack_shortlink":"https:\/\/wp.me\/p8vDvB-J7","jetpack-related-posts":[],"_links":{"self":[{"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/posts\/2797","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/comments?post=2797"}],"version-history":[{"count":2,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/posts\/2797\/revisions"}],"predecessor-version":[{"id":2800,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/posts\/2797\/revisions\/2800"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/media\/2799"}],"wp:attachment":[{"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/media?parent=2797"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/categories?post=2797"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.dbonline.se\/index.php\/wp-json\/wp\/v2\/tags?post=2797"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}